first commit
This commit is contained in:
276
processor.py
Normal file
276
processor.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
数据处理模块 - 增强版去重与分类
|
||||
"""
|
||||
|
||||
import re
|
||||
import hashlib
|
||||
from typing import List, Dict, Set, Tuple
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class TextSimilarity:
    """Static text-similarity metrics: Jaccard, Levenshtein, cosine."""

    @staticmethod
    def jaccard_similarity(text1: str, text2: str) -> float:
        """Jaccard similarity of the two texts' token sets, in [0, 1].

        Returns 0.0 when either text is empty.
        """
        if not text1 or not text2:
            return 0.0

        set1 = set(TextSimilarity.tokenize(text1))
        set2 = set(TextSimilarity.tokenize(text2))

        intersection = len(set1 & set2)
        union = len(set1 | set2)

        return intersection / union if union > 0 else 0.0

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Lower-case, replace punctuation with spaces, keep tokens longer than 1 char."""
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        return [w for w in text.split() if len(w) > 1]

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """Edit distance between s1 and s2 (classic two-row DP, O(len1*len2))."""
        # Keep s2 as the shorter string so the DP rows stay small.
        if len(s1) < len(s2):
            return TextSimilarity.levenshtein_distance(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    @staticmethod
    def normalized_levenshtein(s1: str, s2: str) -> float:
        """Edit-distance similarity normalized to [0, 1]; 0.0 when either string is empty."""
        if not s1 or not s2:
            return 0.0

        max_len = max(len(s1), len(s2))
        distance = TextSimilarity.levenshtein_distance(s1, s2)
        return 1 - (distance / max_len)

    @staticmethod
    def cosine_similarity(text1: str, text2: str) -> float:
        """Cosine similarity of the two texts' term-frequency vectors, in [0, 1].

        BUGFIX: the original built each vector over that text's *own*
        unique-token list, so zip() compared frequencies of unrelated
        words (two texts with disjoint vocabularies could score 1.0).
        Both vectors are now laid out over a single shared, sorted
        vocabulary so each axis refers to the same term.
        """
        if not text1 or not text2:
            return 0.0

        tf1 = TextSimilarity._term_frequencies(text1)
        tf2 = TextSimilarity._term_frequencies(text2)

        # Shared, deterministic axis ordering for both vectors.
        vocabulary = sorted(set(tf1) | set(tf2))
        vec1 = [tf1[t] for t in vocabulary]
        vec2 = [tf2[t] for t in vocabulary]

        dot_product = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
        magnitude1 = sum(v ** 2 for v in vec1) ** 0.5
        magnitude2 = sum(v ** 2 for v in vec2) ** 0.5

        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)

    @staticmethod
    def _term_frequencies(text: str) -> Dict[str, int]:
        """Token -> occurrence count for *text*."""
        tf: Dict[str, int] = defaultdict(int)
        for token in TextSimilarity.tokenize(text):
            tf[token] += 1
        return tf

    @staticmethod
    def _get_vector(text: str) -> List[float]:
        """Term-frequency vector over this text's own unique tokens.

        Kept for backward compatibility; cosine_similarity no longer
        uses it because its axis order is not shared between two texts.
        """
        tf = TextSimilarity._term_frequencies(text)
        return [tf[t] for t in set(TextSimilarity.tokenize(text))]
|
||||
|
||||
|
||||
class Deduplicator:
    """Removes duplicate articles by near-identical title and by exact content hash."""

    def __init__(self, title_threshold: float = 0.8, content_threshold: float = 0.9):
        # Normalized-title similarity at or above this marks a duplicate title.
        self.title_threshold = title_threshold
        # NOTE(review): stored but currently unused — content dedup below is
        # exact-hash only, so no similarity threshold is ever consulted.
        self.content_threshold = content_threshold
        self.seen_titles: Set[str] = set()
        self.seen_content_hashes: Set[str] = set()

    def deduplicate(self, articles: List[Dict]) -> List[Dict]:
        """Return *articles* with duplicate titles/contents removed, order preserved.

        State (seen titles/hashes) accumulates across calls on the same instance.
        """
        unique_articles: List[Dict] = []
        # Maps *normalized* title -> the first kept article with that title.
        # BUGFIX: the original keyed this dict on the raw title while
        # _is_duplicate_title looked entries up by normalized title, which
        # made the publish-date branch there effectively unreachable.
        seen_with_date: Dict[str, Dict] = {}

        for article in articles:
            title = article.get('title', '').strip()
            content = article.get('content', '')
            publish_date = article.get('publish_date', '')

            if self._is_duplicate_title(title, publish_date, seen_with_date):
                continue

            if self._is_duplicate_content(content):
                continue

            normalized = self._normalize_title(title)
            self.seen_titles.add(normalized)
            self.seen_content_hashes.add(self._hash_content(content))

            seen_with_date[normalized] = article
            unique_articles.append(article)

        return unique_articles

    def _normalize_title(self, title: str) -> str:
        """Lower-case the title and strip all whitespace."""
        return re.sub(r'\s+', '', title).lower()

    def _is_duplicate_title(self, title: str, publish_date: str, seen: Dict) -> bool:
        """True when *title* is similar to any previously seen title.

        NOTE(review): inside the similarity branch below, both the date
        comparison and the fall-through return True — the dates never
        change the outcome. Kept as-is to preserve behavior; the intent
        (prefer keeping one of the dated copies?) should be confirmed.
        """
        normalized = self._normalize_title(title)

        for seen_title in self.seen_titles:
            similarity = TextSimilarity.normalized_levenshtein(normalized, seen_title)
            if similarity >= self.title_threshold:
                if seen_title in seen:
                    existing_date = seen[seen_title].get('publish_date', '')
                    if publish_date and existing_date:
                        # _parse_date never raises (it falls back to
                        # datetime.min), but keep a narrow guard rather
                        # than the original bare `except:`.
                        try:
                            if self._parse_date(publish_date) < self._parse_date(existing_date):
                                return True
                        except (ValueError, TypeError):
                            pass
                return True

        return False

    def _is_duplicate_content(self, content: str) -> bool:
        """True when identical content was already seen (exact MD5 match)."""
        if not content:
            return False

        # Reuse _hash_content rather than duplicating the hashing code.
        return self._hash_content(content) in self.seen_content_hashes

    def _hash_content(self, content: str) -> str:
        """MD5 hex digest of the content — a dedup fingerprint, not a security hash."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _parse_date(self, date_str: str) -> datetime:
        """Parse dates like '2024年1月2日' or '2024-01-02'; datetime.min on failure."""
        # Convert Chinese date markers into '-' separators first.
        date_str = date_str.replace('年', '-').replace('月', '-').replace('日', '')

        for fmt in ('%Y-%m-%d', '%Y-%m', '%Y'):
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue

        return datetime.min
|
||||
|
||||
|
||||
class CategoryClassifier:
    """Keyword-based article classifier; '其他政策' is the fallback category."""

    def __init__(self, categories: List[Dict]):
        # Lower priority number = higher precedence; missing priority sorts last (99).
        self.categories = sorted(categories, key=lambda c: c.get('priority', 99))
        self.keyword_index = self._build_index()

    def _build_index(self) -> Dict:
        """Map each keyword to its category name.

        A keyword shared by several categories resolves to the last one
        in priority order (later assignments overwrite earlier ones).
        """
        index: Dict[str, str] = {}
        for category in self.categories:
            for keyword in category.get('keywords', []):
                index[keyword] = category['name']
        return index

    def classify(self, article: Dict) -> str:
        """Return the category whose keywords occur most often in title + content.

        Falls back to '其他政策' when no keyword matches at all.
        """
        title = article.get('title', '')
        content = article.get('content', '')
        full_text = f"{title} {content}"

        # Total keyword occurrences per category. str.count already
        # returns 0 for an absent keyword, so the original's redundant
        # `if keyword in full_text` membership pre-check is dropped.
        scores = {
            category['name']: sum(full_text.count(kw) for kw in category.get('keywords', []))
            for category in self.categories
        }

        if not scores or max(scores.values()) == 0:
            return '其他政策'

        # Ties resolve to the first maximum in dict (= priority) order.
        return max(scores, key=scores.get)

    def classify_batch(self, articles: List[Dict]) -> Dict[str, List[Dict]]:
        """Group articles by classified category; every category key is present."""
        result: Dict[str, List[Dict]] = {cat['name']: [] for cat in self.categories}
        result['其他政策'] = []

        for article in articles:
            result[self.classify(article)].append(article)

        return result
|
||||
|
||||
|
||||
class DataExporter:
    """Exports article dicts to Excel, JSON, or CSV files."""

    # Columns shared by the Excel and CSV exports: (header, article key),
    # in output order. Extracted so the two exporters no longer duplicate
    # the row-building code.
    _BASE_COLUMNS = [
        ('标题', 'title'),
        ('发布时间', 'publish_date'),
        ('来源', 'source'),
        ('类别', 'category'),
        ('摘要', 'summary'),
    ]

    @staticmethod
    def _base_row(article: Dict) -> Dict:
        """Build the common export row; '类别' defaults to '其他政策', others to ''."""
        row = {}
        for header, key in DataExporter._BASE_COLUMNS:
            default = '其他政策' if key == 'category' else ''
            row[header] = article.get(key, default)
        return row

    @staticmethod
    def to_excel(articles: List[Dict], filepath: str):
        """Write articles to an .xlsx file at *filepath* (requires openpyxl)."""
        import pandas as pd

        df_data = []
        for article in articles:
            row = DataExporter._base_row(article)
            # Excel gets the extra provenance columns on top of the base set.
            row.update({
                '关键词': ', '.join(article.get('keywords', [])),
                '原文链接': article.get('url', ''),
                '本地路径': article.get('local_path', ''),
                '抓取时间': article.get('fetch_time', ''),
            })
            df_data.append(row)

        pd.DataFrame(df_data).to_excel(filepath, index=False, engine='openpyxl')

    @staticmethod
    def to_json(articles: List[Dict], filepath: str):
        """Write articles as pretty-printed UTF-8 JSON (non-ASCII kept readable)."""
        import json

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

    @staticmethod
    def to_csv(articles: List[Dict], filepath: str):
        """Write the common columns as CSV; utf-8-sig BOM so Excel detects UTF-8."""
        import pandas as pd

        df = pd.DataFrame([DataExporter._base_row(a) for a in articles])
        df.to_csv(filepath, index=False, encoding='utf-8-sig')
|
||||
Reference in New Issue
Block a user