From 9a8b240e4eeee1420307ee415078b3e8c6c28826 Mon Sep 17 00:00:00 2001 From: goulustis Date: Sat, 22 Nov 2025 19:54:45 +0800 Subject: [PATCH] words_only --- lang_agent/utils.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lang_agent/utils.py b/lang_agent/utils.py index 9adf7ec..d5ce27e 100644 --- a/lang_agent/utils.py +++ b/lang_agent/utils.py @@ -1,5 +1,6 @@ from langchain.chat_models import init_chat_model from langchain_core.language_models import BaseChatModel +import re import os from dotenv import load_dotenv @@ -18,4 +19,22 @@ def make_llm(model="qwen-plus", base_url=base_url, **kwargs) - return llm \ No newline at end of file + return llm + +NON_WORD_PATTERN = re.compile(r'[^\u4e00-\u9fffA-Za-z0-9_\s]') +def words_only(text): + """ + Keep only: + - Chinese characters (U+4E00–U+9FFF) + - Latin letters, digits, underscore + - Whitespace (only to delimit tokens; it is dropped from the result) + Strip punctuation, emojis, etc. + Return one string: the kept tokens concatenated with no separator. + """ + # 1. Replace all non-allowed characters with a space + cleaned = NON_WORD_PATTERN.sub(' ', text) + + # 2. Normalize multiple spaces and split into tokens + tokens = cleaned.split() + + return "".join(tokens) \ No newline at end of file