first commit

This commit is contained in:
辫子哥
2026-03-09 22:03:09 +08:00
commit 3a6a12eeb6
8 changed files with 2168 additions and 0 deletions

276
processor.py Normal file
View File

@@ -0,0 +1,276 @@
"""
数据处理模块 - 增强版去重与分类
"""
import re
import hashlib
from typing import List, Dict, Set, Tuple
from collections import defaultdict
from datetime import datetime
class TextSimilarity:
    """Static helpers for text similarity: Jaccard, Levenshtein, cosine."""

    @staticmethod
    def jaccard_similarity(text1: str, text2: str) -> float:
        """Jaccard similarity of the two texts' token sets, in [0.0, 1.0]."""
        if not text1 or not text2:
            return 0.0
        set1 = set(TextSimilarity.tokenize(text1))
        set2 = set(TextSimilarity.tokenize(text2))
        intersection = len(set1 & set2)
        union = len(set1 | set2)
        return intersection / union if union > 0 else 0.0

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Lowercase, replace punctuation with spaces, split, and drop
        single-character tokens."""
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        return [w for w in text.split() if len(w) > 1]

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """Classic two-row dynamic-programming edit distance."""
        # Ensure s1 is the longer string so the rows track the shorter one.
        if len(s1) < len(s2):
            return TextSimilarity.levenshtein_distance(s2, s1)
        if len(s2) == 0:
            return len(s1)
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    @staticmethod
    def normalized_levenshtein(s1: str, s2: str) -> float:
        """Edit-distance similarity normalized to [0.0, 1.0] (1.0 = identical).

        Returns 0.0 when either string is empty (including both empty).
        """
        if not s1 or not s2:
            return 0.0
        max_len = max(len(s1), len(s2))
        distance = TextSimilarity.levenshtein_distance(s1, s2)
        return 1 - (distance / max_len)

    @staticmethod
    def cosine_similarity(text1: str, text2: str) -> float:
        """Cosine similarity of the texts' term-frequency vectors, in [0.0, 1.0].

        Bug fix: both vectors are now built over a single SHARED vocabulary.
        The previous version vectorized each text against its own
        arbitrarily-ordered unique-token list, so zip() paired unrelated
        dimensions and the score was meaningless (disjoint texts could
        score 1.0).
        """
        if not text1 or not text2:
            return 0.0
        vocab = sorted(set(TextSimilarity.tokenize(text1)) |
                       set(TextSimilarity.tokenize(text2)))
        vec1 = TextSimilarity._get_vector(text1, vocab)
        vec2 = TextSimilarity._get_vector(text2, vocab)
        dot_product = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
        magnitude1 = sum(v * v for v in vec1) ** 0.5
        magnitude2 = sum(v * v for v in vec2) ** 0.5
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0
        return dot_product / (magnitude1 * magnitude2)

    @staticmethod
    def _get_vector(text: str, vocab: List[str] = None) -> List[float]:
        """Term-frequency vector for `text` over `vocab`.

        `vocab` defaults to the text's own unique tokens (the old behavior);
        pass a shared vocabulary to make two vectors comparable.
        """
        tokens = TextSimilarity.tokenize(text)
        tf: Dict[str, int] = defaultdict(int)
        for token in tokens:
            tf[token] += 1
        if vocab is None:
            vocab = list(set(tokens))
        return [tf[t] for t in vocab]
class Deduplicator:
    """Removes articles whose title (fuzzy) or content (exact MD5) duplicates
    one already seen."""

    def __init__(self, title_threshold: float = 0.8, content_threshold: float = 0.9):
        # Titles with normalized-Levenshtein similarity >= title_threshold
        # count as duplicates.
        self.title_threshold = title_threshold
        # NOTE(review): content_threshold is stored but never used — content
        # dedup is exact MD5 matching. Kept for interface compatibility.
        self.content_threshold = content_threshold
        self.seen_titles: Set[str] = set()
        self.seen_content_hashes: Set[str] = set()

    def deduplicate(self, articles: List[Dict]) -> List[Dict]:
        """Return `articles` with duplicates removed; first occurrence wins."""
        unique_articles: List[Dict] = []
        seen_with_date: Dict[str, Dict] = {}
        for article in articles:
            title = article.get('title', '').strip()
            content = article.get('content', '')
            publish_date = article.get('publish_date', '')
            if self._is_duplicate_title(title, publish_date, seen_with_date):
                continue
            if self._is_duplicate_content(content):
                continue
            normalized = self._normalize_title(title)
            self.seen_titles.add(normalized)
            self.seen_content_hashes.add(self._hash_content(content))
            # Bug fix: key by the NORMALIZED title so lookups that use
            # normalized titles can actually find entries (the old code
            # keyed by raw title, making the lookup always miss).
            seen_with_date[normalized] = article
            unique_articles.append(article)
        return unique_articles

    def _normalize_title(self, title: str) -> str:
        """Lowercase the title and strip all whitespace."""
        return re.sub(r'\s+', '', title).lower()

    def _is_duplicate_title(self, title: str, publish_date: str, seen: Dict) -> bool:
        """True if `title` is >= title_threshold similar to any seen title.

        NOTE(review): the original also compared publish dates here, but every
        branch of that check returned True, so it was dead code — a
        sufficiently similar title is a duplicate regardless of date. The
        `publish_date` and `seen` parameters are kept so the signature is
        unchanged for callers.
        """
        normalized = self._normalize_title(title)
        for seen_title in self.seen_titles:
            similarity = TextSimilarity.normalized_levenshtein(normalized, seen_title)
            if similarity >= self.title_threshold:
                return True
        return False

    def _is_duplicate_content(self, content: str) -> bool:
        """True if this exact content (by MD5) was seen before.

        Empty content is never treated as a duplicate.
        """
        if not content:
            return False
        return self._hash_content(content) in self.seen_content_hashes

    def _hash_content(self, content: str) -> str:
        """MD5 hex digest of the content — a dedup fingerprint, not security."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _parse_date(self, date_str: str) -> datetime:
        """Parse '2024-03-05' / '2024年3月5日' style dates.

        Returns ``datetime.min`` when no format matches.

        Bug fix: the replace() calls had lost their CJK markers and were
        calling ``str.replace('', '-')``, which inserts '-' between every
        character of the string and made every parse fail. Restore the
        年/月/日 -> '-'/'-'/'' normalization.
        """
        date_str = date_str.replace('年', '-').replace('月', '-').replace('日', '')
        for fmt in ('%Y-%m-%d', '%Y-%m', '%Y'):
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:  # narrow: strptime raises ValueError on mismatch
                continue
        return datetime.min
class CategoryClassifier:
    """Keyword-scoring article classifier with a '其他政策' fallback."""

    def __init__(self, categories: List[Dict]):
        # Lower 'priority' value sorts first; missing priority defaults to 99.
        self.categories = sorted(categories, key=lambda c: c.get('priority', 99))
        self.keyword_index = self._build_index()

    def _build_index(self) -> Dict:
        """Map each keyword to its category name (later categories win on clash)."""
        return {
            keyword: category['name']
            for category in self.categories
            for keyword in category.get('keywords', [])
        }

    def classify(self, article: Dict) -> str:
        """Score every category by keyword occurrence count in the article's
        title + content and return the best; ties go to the highest-priority
        (earliest) category. Returns '其他政策' when nothing matches."""
        full_text = "{} {}".format(article.get('title', ''), article.get('content', ''))
        scores = {
            category['name']: sum(
                full_text.count(keyword)
                for keyword in category.get('keywords', [])
                if keyword in full_text
            )
            for category in self.categories
        }
        if not scores or max(scores.values()) == 0:
            return '其他政策'
        return max(scores, key=scores.get)

    def classify_batch(self, articles: List[Dict]) -> Dict[str, List[Dict]]:
        """Group articles by classify(); every known category (plus the
        fallback) is present in the result, possibly with an empty list."""
        buckets: Dict[str, List[Dict]] = {cat['name']: [] for cat in self.categories}
        buckets['其他政策'] = []
        for article in articles:
            buckets[self.classify(article)].append(article)
        return buckets
class DataExporter:
    """Exports article dicts to Excel / JSON / CSV files."""

    @staticmethod
    def _base_row(article: Dict) -> Dict:
        """The five columns shared by the Excel and CSV exports.

        Extracted so the two exporters cannot drift apart (the originals
        duplicated this dict literal).
        """
        return {
            '标题': article.get('title', ''),
            '发布时间': article.get('publish_date', ''),
            '来源': article.get('source', ''),
            '类别': article.get('category', '其他政策'),
            '摘要': article.get('summary', ''),
        }

    @staticmethod
    def to_excel(articles: List[Dict], filepath: str):
        """Write articles to an .xlsx file (requires pandas + openpyxl)."""
        import pandas as pd
        df_data = []
        for article in articles:
            row = DataExporter._base_row(article)
            # Extra columns only the Excel export carries.
            row.update({
                '关键词': ', '.join(article.get('keywords', [])),
                '原文链接': article.get('url', ''),
                '本地路径': article.get('local_path', ''),
                '抓取时间': article.get('fetch_time', ''),
            })
            df_data.append(row)
        df = pd.DataFrame(df_data)
        df.to_excel(filepath, index=False, engine='openpyxl')

    @staticmethod
    def to_json(articles: List[Dict], filepath: str):
        """Write articles to a UTF-8 JSON file (non-ASCII kept readable)."""
        import json
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

    @staticmethod
    def to_csv(articles: List[Dict], filepath: str):
        """Write the base columns to CSV; utf-8-sig so Excel detects UTF-8."""
        import pandas as pd
        df_data = [DataExporter._base_row(article) for article in articles]
        df = pd.DataFrame(df_data)
        df.to_csv(filepath, index=False, encoding='utf-8-sig')