first commit
This commit is contained in:
276
processor.py
Normal file
276
processor.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
数据处理模块 - 增强版去重与分类
|
||||
"""
|
||||
|
||||
import re
|
||||
import hashlib
|
||||
from typing import List, Dict, Set, Tuple
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class TextSimilarity:
    """Static text-similarity metrics: Jaccard, Levenshtein, cosine."""

    @staticmethod
    def jaccard_similarity(text1: str, text2: str) -> float:
        """Jaccard similarity of the two texts' token sets, in [0, 1].

        Returns 0.0 when either text is empty.
        """
        if not text1 or not text2:
            return 0.0

        set1 = set(TextSimilarity.tokenize(text1))
        set2 = set(TextSimilarity.tokenize(text2))

        intersection = len(set1 & set2)
        union = len(set1 | set2)

        return intersection / union if union > 0 else 0.0

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Lower-case, replace punctuation with spaces, keep tokens longer than 1 char."""
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        return [w for w in text.split() if len(w) > 1]

    @staticmethod
    def levenshtein_distance(s1: str, s2: str) -> int:
        """Edit distance between s1 and s2 (classic two-row DP, O(len1*len2))."""
        # Keep s2 as the shorter string so the DP rows stay small.
        if len(s1) < len(s2):
            return TextSimilarity.levenshtein_distance(s2, s1)

        if len(s2) == 0:
            return len(s1)

        previous_row = range(len(s2) + 1)
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    @staticmethod
    def normalized_levenshtein(s1: str, s2: str) -> float:
        """Edit-distance similarity normalized to [0, 1]; 0.0 when either string is empty."""
        if not s1 or not s2:
            return 0.0

        max_len = max(len(s1), len(s2))
        distance = TextSimilarity.levenshtein_distance(s1, s2)
        return 1 - (distance / max_len)

    @staticmethod
    def cosine_similarity(text1: str, text2: str) -> float:
        """Cosine similarity of the two texts' term-frequency vectors, in [0, 1].

        BUGFIX: the original built each vector over that text's *own*
        unique-token list, so zip() compared frequencies of unrelated
        words (two texts with disjoint vocabularies could score 1.0).
        Both vectors are now laid out over a single shared, sorted
        vocabulary so each axis refers to the same term.
        """
        if not text1 or not text2:
            return 0.0

        tf1 = TextSimilarity._term_frequencies(text1)
        tf2 = TextSimilarity._term_frequencies(text2)

        # Shared, deterministic axis ordering for both vectors.
        vocabulary = sorted(set(tf1) | set(tf2))
        vec1 = [tf1[t] for t in vocabulary]
        vec2 = [tf2[t] for t in vocabulary]

        dot_product = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
        magnitude1 = sum(v ** 2 for v in vec1) ** 0.5
        magnitude2 = sum(v ** 2 for v in vec2) ** 0.5

        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)

    @staticmethod
    def _term_frequencies(text: str) -> Dict[str, int]:
        """Token -> occurrence count for *text*."""
        tf: Dict[str, int] = defaultdict(int)
        for token in TextSimilarity.tokenize(text):
            tf[token] += 1
        return tf

    @staticmethod
    def _get_vector(text: str) -> List[float]:
        """Term-frequency vector over this text's own unique tokens.

        Kept for backward compatibility; cosine_similarity no longer
        uses it because its axis order is not shared between two texts.
        """
        tf = TextSimilarity._term_frequencies(text)
        return [tf[t] for t in set(TextSimilarity.tokenize(text))]
|
||||
|
||||
|
||||
class Deduplicator:
    """Removes duplicate articles by near-identical title and by exact content hash."""

    def __init__(self, title_threshold: float = 0.8, content_threshold: float = 0.9):
        # Normalized-title similarity at or above this marks a duplicate title.
        self.title_threshold = title_threshold
        # NOTE(review): stored but currently unused — content dedup below is
        # exact-hash only, so no similarity threshold is ever consulted.
        self.content_threshold = content_threshold
        self.seen_titles: Set[str] = set()
        self.seen_content_hashes: Set[str] = set()

    def deduplicate(self, articles: List[Dict]) -> List[Dict]:
        """Return *articles* with duplicate titles/contents removed, order preserved.

        State (seen titles/hashes) accumulates across calls on the same instance.
        """
        unique_articles: List[Dict] = []
        # Maps *normalized* title -> the first kept article with that title.
        # BUGFIX: the original keyed this dict on the raw title while
        # _is_duplicate_title looked entries up by normalized title, which
        # made the publish-date branch there effectively unreachable.
        seen_with_date: Dict[str, Dict] = {}

        for article in articles:
            title = article.get('title', '').strip()
            content = article.get('content', '')
            publish_date = article.get('publish_date', '')

            if self._is_duplicate_title(title, publish_date, seen_with_date):
                continue

            if self._is_duplicate_content(content):
                continue

            normalized = self._normalize_title(title)
            self.seen_titles.add(normalized)
            self.seen_content_hashes.add(self._hash_content(content))

            seen_with_date[normalized] = article
            unique_articles.append(article)

        return unique_articles

    def _normalize_title(self, title: str) -> str:
        """Lower-case the title and strip all whitespace."""
        return re.sub(r'\s+', '', title).lower()

    def _is_duplicate_title(self, title: str, publish_date: str, seen: Dict) -> bool:
        """True when *title* is similar to any previously seen title.

        NOTE(review): inside the similarity branch below, both the date
        comparison and the fall-through return True — the dates never
        change the outcome. Kept as-is to preserve behavior; the intent
        (prefer keeping one of the dated copies?) should be confirmed.
        """
        normalized = self._normalize_title(title)

        for seen_title in self.seen_titles:
            similarity = TextSimilarity.normalized_levenshtein(normalized, seen_title)
            if similarity >= self.title_threshold:
                if seen_title in seen:
                    existing_date = seen[seen_title].get('publish_date', '')
                    if publish_date and existing_date:
                        # _parse_date never raises (it falls back to
                        # datetime.min), but keep a narrow guard rather
                        # than the original bare `except:`.
                        try:
                            if self._parse_date(publish_date) < self._parse_date(existing_date):
                                return True
                        except (ValueError, TypeError):
                            pass
                return True

        return False

    def _is_duplicate_content(self, content: str) -> bool:
        """True when identical content was already seen (exact MD5 match)."""
        if not content:
            return False

        # Reuse _hash_content rather than duplicating the hashing code.
        return self._hash_content(content) in self.seen_content_hashes

    def _hash_content(self, content: str) -> str:
        """MD5 hex digest of the content — a dedup fingerprint, not a security hash."""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def _parse_date(self, date_str: str) -> datetime:
        """Parse dates like '2024年1月2日' or '2024-01-02'; datetime.min on failure."""
        # Convert Chinese date markers into '-' separators first.
        date_str = date_str.replace('年', '-').replace('月', '-').replace('日', '')

        for fmt in ('%Y-%m-%d', '%Y-%m', '%Y'):
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue

        return datetime.min
|
||||
|
||||
|
||||
class CategoryClassifier:
    """Keyword-based article classifier; '其他政策' is the fallback category."""

    def __init__(self, categories: List[Dict]):
        # Lower priority number = higher precedence; missing priority sorts last (99).
        self.categories = sorted(categories, key=lambda c: c.get('priority', 99))
        self.keyword_index = self._build_index()

    def _build_index(self) -> Dict:
        """Map each keyword to its category name.

        A keyword shared by several categories resolves to the last one
        in priority order (later assignments overwrite earlier ones).
        """
        index: Dict[str, str] = {}
        for category in self.categories:
            for keyword in category.get('keywords', []):
                index[keyword] = category['name']
        return index

    def classify(self, article: Dict) -> str:
        """Return the category whose keywords occur most often in title + content.

        Falls back to '其他政策' when no keyword matches at all.
        """
        title = article.get('title', '')
        content = article.get('content', '')
        full_text = f"{title} {content}"

        # Total keyword occurrences per category. str.count already
        # returns 0 for an absent keyword, so the original's redundant
        # `if keyword in full_text` membership pre-check is dropped.
        scores = {
            category['name']: sum(full_text.count(kw) for kw in category.get('keywords', []))
            for category in self.categories
        }

        if not scores or max(scores.values()) == 0:
            return '其他政策'

        # Ties resolve to the first maximum in dict (= priority) order.
        return max(scores, key=scores.get)

    def classify_batch(self, articles: List[Dict]) -> Dict[str, List[Dict]]:
        """Group articles by classified category; every category key is present."""
        result: Dict[str, List[Dict]] = {cat['name']: [] for cat in self.categories}
        result['其他政策'] = []

        for article in articles:
            result[self.classify(article)].append(article)

        return result
|
||||
|
||||
|
||||
class DataExporter:
    """Exports article dicts to Excel, JSON, or CSV files."""

    # Columns shared by the Excel and CSV exports: (header, article key),
    # in output order. Extracted so the two exporters no longer duplicate
    # the row-building code.
    _BASE_COLUMNS = [
        ('标题', 'title'),
        ('发布时间', 'publish_date'),
        ('来源', 'source'),
        ('类别', 'category'),
        ('摘要', 'summary'),
    ]

    @staticmethod
    def _base_row(article: Dict) -> Dict:
        """Build the common export row; '类别' defaults to '其他政策', others to ''."""
        row = {}
        for header, key in DataExporter._BASE_COLUMNS:
            default = '其他政策' if key == 'category' else ''
            row[header] = article.get(key, default)
        return row

    @staticmethod
    def to_excel(articles: List[Dict], filepath: str):
        """Write articles to an .xlsx file at *filepath* (requires openpyxl)."""
        import pandas as pd

        df_data = []
        for article in articles:
            row = DataExporter._base_row(article)
            # Excel gets the extra provenance columns on top of the base set.
            row.update({
                '关键词': ', '.join(article.get('keywords', [])),
                '原文链接': article.get('url', ''),
                '本地路径': article.get('local_path', ''),
                '抓取时间': article.get('fetch_time', ''),
            })
            df_data.append(row)

        pd.DataFrame(df_data).to_excel(filepath, index=False, engine='openpyxl')

    @staticmethod
    def to_json(articles: List[Dict], filepath: str):
        """Write articles as pretty-printed UTF-8 JSON (non-ASCII kept readable)."""
        import json

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

    @staticmethod
    def to_csv(articles: List[Dict], filepath: str):
        """Write the common columns as CSV; utf-8-sig BOM so Excel detects UTF-8."""
        import pandas as pd

        df = pd.DataFrame([DataExporter._base_row(a) for a in articles])
        df.to_csv(filepath, index=False, encoding='utf-8-sig')
|
||||
Reference in New Issue
Block a user