less-norm

2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions

View File

@@ -48,40 +48,42 @@ async def messages_routing(msg, state):
         reply_to_msg_id = reply_msg.get("message_id")
         if not reply_to_msg_id and latest_toxic_message_id:
             reply_to_msg_id = int(latest_toxic_message_id)
 
-        # count average between all of messages
-        toxic_pattern = f"toxic:{cid}:{uid}:*"
-        toxic_score = await get_average_pattern(toxic_pattern)
-
-        # current mesasage toxicity
+        # count toxicity
         if reply_to_msg_id:
+            # count one message score
             one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
             reply_text = ""
             if one_score:
                 logger.debug(one_score)
                 reply_text += f"{int(one_score)}% токсичности\n"
+
+            # count average between all of messages
+            toxic_pattern = f"toxic:{cid}:{uid}:*"
+            toxic_score = await get_average_pattern(toxic_pattern)
             if toxic_score:
                 emoji = (
                     "😳"
                     if toxic_score > 90
                     else "😟"
                     if toxic_score > 80
                     else "😏"
                     if toxic_score > 60
                     else "🙂"
                     if toxic_score > 20
                     else "😇"
                 )
                 reply_text += (
                     f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
                 )
             if reply_text:
                 await telegram_api(
                     "sendMessage",
                     chat_id=cid,
                     reply_to_message_id=reply_to_msg_id,
                     text=reply_text,
                 )
         try:
             await telegram_api("deleteMessage", chat_id=cid, message_id=mid)
         except Exception:
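The hunk above calls a helper, get_average_pattern, that this commit does not show. A minimal sketch of what such a helper could look like, assuming a module-level async redis client and integer scores stored one per key (both are assumptions, not code from this repo):

from redis.asyncio import Redis

redis = Redis()  # assumption: the bot keeps a shared async client like this


async def get_average_pattern(pattern: str):
    """Average the integer scores stored under all keys matching `pattern`."""
    scores = []
    # scan_iter walks matching keys incrementally instead of blocking with KEYS
    async for key in redis.scan_iter(match=pattern):
        value = await redis.get(key)
        if value is not None:
            scores.append(int(value))
    return round(sum(scores) / len(scores)) if scores else None

With this shape, a pattern with no matches yields None, which is why the hunk guards with `if toxic_score:` before formatting the reply.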

View File

@@ -1,13 +1,7 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 import logging
 
 logger = logging.getLogger("nlp.normalize")
 
-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-
 
 def is_russian_wording(text):
     """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
             return True
     return False
 
-
-def segment_text(text):
-    """
-    Use a neural network model to segment text into words.
-    """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return segmented_text
-
 
 def normalize(text):
     """
     Normalize English text to resemble Russian characters.
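This hunk trims the file down to is_russian_wording and normalize. Only normalize's docstring is visible here ("Normalize English text to resemble Russian characters"), so its body is not shown by the diff; a hypothetical homoglyph substitution in that spirit, where both the table and the function name below are illustrative rather than the repo's actual code:

# Hypothetical Latin-to-Cyrillic homoglyph table; the real mapping may differ.
LATIN_TO_CYRILLIC = {
    "a": "а", "c": "с", "e": "е", "o": "о", "p": "р", "x": "х", "y": "у",
    "A": "А", "B": "В", "C": "С", "E": "Е", "H": "Н", "K": "К", "M": "М",
    "O": "О", "P": "Р", "T": "Т", "X": "Х",
}


def normalize_sketch(text: str) -> str:
    # Swap each Latin look-alike for its Cyrillic counterpart, e.g. "cop" -> "сор"
    return "".join(LATIN_TO_CYRILLIC.get(ch, ch) for ch in text)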

nlp/segment_text.py Normal file (25 additions)
View File

@@ -0,0 +1,25 @@
+import torch
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+# Use ByT5 for the ByT5 model
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+
+def segment_text(text):
+    """
+    Use a neural network model to segment text into words.
+    """
+    # Encode the input text for the model as UTF-8 bytes
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+
+    # Generate predictions
+    with torch.no_grad():
+        outputs = model.generate(inputs)
+
+    # Decode the generated tokens back to text
+    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return segmented_text
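A usage sketch for the new module. Two caveats worth flagging: the stock google/byt5-small checkpoint is not trained on a "segment:" task, so meaningful word segmentation presumably requires a fine-tuned checkpoint swapped in at deploy time; and generate() defaults to a short max_length, so long inputs can come back truncated unless a limit such as max_new_tokens is passed.

from nlp.segment_text import segment_text

# Glued-together Russian words, the kind of obfuscation the bot targets
glued = "приветкакдела"
print(segment_text(glued))  # expected shape: "привет как дела"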

View File

@@ -2,5 +2,5 @@ redis[hiredis]
 aiohttp
 torch
 transformers
-protobuf
-sentencepiece
+# protobuf
+# sentencepiece