From 9a8b240e4eeee1420307ee415078b3e8c6c28826 Mon Sep 17 00:00:00 2001
From: goulustis <cnimaiv@gmail.com>
Date: Sat, 22 Nov 2025 19:54:45 +0800
Subject: [PATCH] words_only

---
 lang_agent/utils.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/lang_agent/utils.py b/lang_agent/utils.py
index 9adf7ec..d5ce27e 100644
--- a/lang_agent/utils.py
+++ b/lang_agent/utils.py
@@ -1,5 +1,6 @@
 from langchain.chat_models import init_chat_model
 from langchain_core.language_models import BaseChatModel
+import re
 
 import os
 from dotenv import load_dotenv
@@ -18,4 +19,22 @@ def make_llm(model="qwen-plus",
                           base_url=base_url,
                           **kwargs)
     
-    return llm
\ No newline at end of file
+    return llm
+
+NON_WORD_PATTERN = re.compile(r'[^\u4e00-\u9fffA-Za-z0-9_\s]')
+def words_only(text):
+    """
+    Keep only:
+        - Chinese characters (U+4E00–U+9FFF)
+        - Latin letters, digits, underscore
+        - Whitespace (as separators)
+    Strip punctuation, emojis, etc.
+    Return a list of tokens (Chinese blocks or Latin word blocks).
+    """
+    # 1. Replace all non-allowed characters with a space
+    cleaned = NON_WORD_PATTERN.sub(' ', text)
+
+    # 2. Normalize multiple spaces and split into tokens
+    tokens = cleaned.split()
+
+    return "".join(tokens)
\ No newline at end of file