words_only

2025-11-22 19:54:45 +08:00
parent 4db1b87c81
commit 9a8b240e4e
1 changed files with 20 additions and 1 deletions
--- a/lang_agent/utils.py
+++ b/lang_agent/utils.py
@@ -1,5 +1,6 @@
 from langchain.chat_models import init_chat_model
 from langchain_core.language_models import BaseChatModel
 import re
 import os
 from dotenv import load_dotenv
@@ -18,4 +19,22 @@ def make_llm(model="qwen-plus",
                          base_url=base_url,
                          **kwargs)
-    return llm
+    return llm
 NON_WORD_PATTERN = re.compile(r'[^\u4e00-\u9fffA-Za-z0-9_\s]')
 def words_only(text):
    """
    Keep only:
        - Chinese characters (U+4E00–U+9FFF)
        - Latin letters, digits, underscore
        - Whitespace (as separators)
    Strip punctuation, emojis, etc.
    Return a list of tokens (Chinese blocks or Latin word blocks).
    """
    # 1. Replace all non-allowed characters with a space
    cleaned = NON_WORD_PATTERN.sub(' ', text)
    # 2. Normalize multiple spaces and split into tokens
    tokens = cleaned.split()
    return "".join(tokens)