less-norm

2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions

View File

@@ -48,40 +48,42 @@ async def messages_routing(msg, state):
         reply_to_msg_id = reply_msg.get("message_id")
         if not reply_to_msg_id and latest_toxic_message_id:
             reply_to_msg_id = int(latest_toxic_message_id)
 
-        # count average between all of messages
-        toxic_pattern = f"toxic:{cid}:{uid}:*"
-        toxic_score = await get_average_pattern(toxic_pattern)
-
-        # current mesasage toxicity
+        # count toxicity
         if reply_to_msg_id:
+            # count one message score
             one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
             reply_text = ""
             if one_score:
                 logger.debug(one_score)
                 reply_text += f"{int(one_score)}% токсичности\n"
+
+            # count average between all of messages
+            toxic_pattern = f"toxic:{cid}:{uid}:*"
+            toxic_score = await get_average_pattern(toxic_pattern)
             if toxic_score:
                 emoji = (
                     "😳"
                     if toxic_score > 90
                     else "😟"
                     if toxic_score > 80
                     else "😏"
                     if toxic_score > 60
                     else "🙂"
                     if toxic_score > 20
                     else "😇"
                 )
                 reply_text += (
                     f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
                 )
             if reply_text:
                 await telegram_api(
                     "sendMessage",
                     chat_id=cid,
                     reply_to_message_id=reply_to_msg_id,
                     text=reply_text,
                 )
         try:
             await telegram_api("deleteMessage", chat_id=cid, message_id=mid)
         except Exception:
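The hunk above calls a helper, get_average_pattern, that this commit does not show. A minimal sketch of what such a helper could look like, assuming a module-level async redis client and integer scores stored one per key (both are assumptions, not code from this repo):

from redis.asyncio import Redis

redis = Redis()  # assumption: the bot keeps a shared async client like this


async def get_average_pattern(pattern: str):
    """Average the integer scores stored under all keys matching `pattern`."""
    scores = []
    # scan_iter walks matching keys incrementally instead of blocking with KEYS
    async for key in redis.scan_iter(match=pattern):
        value = await redis.get(key)
        if value is not None:
            scores.append(int(value))
    return round(sum(scores) / len(scores)) if scores else None

With this shape, a pattern with no matches yields None, which is why the hunk guards with `if toxic_score:` before formatting the reply.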

View File

@@ -1,13 +1,7 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 import logging
 
 logger = logging.getLogger("nlp.normalize")
 
-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-
 
 def is_russian_wording(text):
     """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
             return True
     return False
 
-
-def segment_text(text):
-    """
-    Use a neural network model to segment text into words.
-    """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return segmented_text
-
 
 def normalize(text):
     """
     Normalize English text to resemble Russian characters.
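This hunk trims the file down to is_russian_wording and normalize. Only normalize's docstring is visible here ("Normalize English text to resemble Russian characters"), so its body is not shown by the diff; a hypothetical homoglyph substitution in that spirit, where both the table and the function name below are illustrative rather than the repo's actual code:

# Hypothetical Latin-to-Cyrillic homoglyph table; the real mapping may differ.
LATIN_TO_CYRILLIC = {
    "a": "а", "c": "с", "e": "е", "o": "о", "p": "р", "x": "х", "y": "у",
    "A": "А", "B": "В", "C": "С", "E": "Е", "H": "Н", "K": "К", "M": "М",
    "O": "О", "P": "Р", "T": "Т", "X": "Х",
}


def normalize_sketch(text: str) -> str:
    # Swap each Latin look-alike for its Cyrillic counterpart, e.g. "cop" -> "сор"
    return "".join(LATIN_TO_CYRILLIC.get(ch, ch) for ch in text)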

nlp/segment_text.py Normal file (25 additions)
View File

@@ -0,0 +1,25 @@
+import torch
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+# Use ByT5 for the ByT5 model
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+
+def segment_text(text):
+    """
+    Use a neural network model to segment text into words.
+    """
+    # Encode the input text for the model as UTF-8 bytes
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+
+    # Generate predictions
+    with torch.no_grad():
+        outputs = model.generate(inputs)
+
+    # Decode the generated tokens back to text
+    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return segmented_text
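A usage sketch for the new module. Two caveats worth flagging: the stock google/byt5-small checkpoint is not trained on a "segment:" task, so meaningful word segmentation presumably requires a fine-tuned checkpoint swapped in at deploy time; and generate() defaults to a short max_length, so long inputs can come back truncated unless a limit such as max_new_tokens is passed.

from nlp.segment_text import segment_text

# Glued-together Russian words, the kind of obfuscation the bot targets
glued = "приветкакдела"
print(segment_text(glued))  # expected shape: "привет как дела"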

View File

@@ -2,5 +2,5 @@ redis[hiredis]
 aiohttp
 torch
 transformers
-protobuf
-sentencepiece
+# protobuf
+# sentencepiece