Files
bzg_skills/policy_retrieval.py
2026-03-09 22:03:09 +08:00

494 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
政策法规检索与整理系统
自动化从中国税务相关部门网站抓取、筛选、下载和整理政策法规文件
"""
import argparse
import logging
import os
import sys
import json
import hashlib
import time
import re
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from urllib.parse import urljoin, urlparse
import subprocess
import yaml
import requests
from bs4 import BeautifulSoup
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import pandas as pd
from notifier import EmailNotifier
class PolicyRetrievalSystem:
    """Policy/regulation retrieval and organisation system (main class).

    Fetches articles from configured tax-authority sites, filters,
    deduplicates and categorises them, downloads attachments, writes a
    summary report and optionally e-mails it (see run()).
    """
    def __init__(self, config_path: Optional[str] = None):
        # Directory containing this file; config, logs and output are
        # resolved relative to it.
        self.base_dir = Path(__file__).parent
        # Explicit path wins; otherwise fall back to config.yaml next to the script.
        self.config_path = config_path or str(self.base_dir / "config.yaml")
        self.config = self._load_config()
        # Configure logging before the getLogger call below picks up handlers.
        self.setup_logging()
        self.logger = logging.getLogger(__name__)
        self.scheduler = None  # created lazily by start_scheduler()
        self.results = []      # raw articles collected by the last run()
        self.notifier = EmailNotifier(self.config)
        # Default report recipients; may be overridden from the CLI (see main()).
        self.recipients = self.config.get('notification', {}).get('email', {}).get('to_addrs', [])
def _load_config(self) -> dict:
    """Load the YAML configuration file.

    Returns:
        The parsed configuration mapping. Falls back to the built-in
        defaults when the file is missing or does not parse to a dict
        (yaml.safe_load returns None for an empty file, which previously
        crashed every later ``self.config.get`` call).
    """
    try:
        with open(self.config_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        return self._default_config()
    # Guard against empty or non-mapping YAML content.
    return data if isinstance(data, dict) else self._default_config()
def _default_config(self) -> dict:
"""默认配置"""
return {
'scheduler': {'enabled': False, 'time': '09:00', 'days': ['mon', 'tue', 'wed', 'thu', 'fri']},
'targets': [{'name': '国家税务总局', 'url': 'https://www.chinatax.gov.cn/', 'enabled': True}],
'download': {'path': './downloads', 'formats': ['pdf', 'doc', 'docx', 'txt']},
'deduplication': {'title_similarity': 0.8, 'content_similarity': 0.9},
'categories': [{'name': '税收政策', 'keywords': ['税收', '税务']}]
}
def setup_logging(self):
    """Configure root logging with a file handler and a console handler.

    The log directory is created under ``base_dir``; the default log
    file now lives there as well. (Previously the directory was created
    under ``base_dir`` but the default FileHandler path was the
    CWD-relative './logs/policy_retrieval.log', which raised
    FileNotFoundError when the script ran from another directory.)
    """
    log_config = self.config.get('logging', {})
    log_dir = self.base_dir / 'logs'
    log_dir.mkdir(exist_ok=True)
    # Default the file into the directory we just created.
    log_file = log_config.get('file', str(log_dir / 'policy_retrieval.log'))
    logging.basicConfig(
        level=getattr(logging, log_config.get('level', 'INFO')),
        format=log_config.get('format', '%(asctime)s - %(name)s - %(levelname)s - %(message)s'),
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
def run(self, send_email: bool = True):
    """Execute one full retrieval pass.

    Fetches from every enabled target, then filters, deduplicates,
    categorises, downloads attachments, writes the report and finally
    e-mails it when requested.

    Args:
        send_email: whether to e-mail the report afterwards (default True).

    Returns:
        The list of articles whose attachments were downloaded.
    """
    banner = "=" * 60
    self.logger.info(banner)
    self.logger.info("开始执行政策法规检索任务")
    self.logger.info(banner)
    self.results = []
    enabled_targets = (t for t in self.config.get('targets', []) if t.get('enabled', False))
    for target in enabled_targets:
        self.logger.info(f"正在检索: {target['name']}")
        try:
            fetched = self.fetch_articles(target)
        except Exception as e:
            self.logger.error(f"检索 {target['name']} 时出错: {e}")
            continue
        self.logger.info(f"{target['name']} 获取到 {len(fetched)} 条记录")
        self.results.extend(fetched)
    self.logger.info(f"共获取 {len(self.results)} 条原始记录")
    kept = self.filter_content(self.results)
    self.logger.info(f"筛选后保留 {len(kept)} 条记录")
    unique = self.deduplicate(kept)
    self.logger.info(f"去重后保留 {len(unique)} 条记录")
    grouped = self.categorize(unique)
    self.logger.info(f"分类完成,共 {len(grouped)} 个类别")
    downloaded = self.download_files(grouped)
    self.logger.info(f"文件下载完成,{len(downloaded)} 个文件")
    report_file = self.generate_report(downloaded)
    self.logger.info(banner)
    self.logger.info("政策法规检索任务完成")
    self.logger.info(banner)
    if send_email and self.recipients:
        self.logger.info(f"正在发送邮件报告到: {self.recipients}")
        # Annotate each article with its category for the e-mail body.
        for item in downloaded:
            item['category'] = self.get_category(item)
        sent_ok = self.notifier.send_policy_report(
            articles=downloaded,
            to_addrs=self.recipients,
            report_file=str(report_file) if report_file else None,
        )
        if sent_ok:
            self.logger.info("邮件报告发送成功")
        else:
            self.logger.warning("邮件报告发送失败")
    return downloaded
def fetch_articles(self, target: Dict) -> List[Dict]:
    """Collect candidate article entries from one target site's landing page.

    An anchor qualifies when its text contains any of the target's
    keywords; each qualifying entry is then enriched with detail-page
    data via fetch_article_detail(). Network errors are logged and an
    empty (or partial) list is returned.
    """
    collected: List[Dict] = []
    keywords = target.get('keywords', [])
    base_url = target['url']
    try:
        request_headers = {
            'User-Agent': self.config.get('download', {}).get('user_agent', 'Mozilla/5.0')
        }
        response = requests.get(base_url, headers=request_headers, timeout=30)
        response.encoding = 'utf-8'
        page = BeautifulSoup(response.text, 'html.parser')
        for anchor in page.find_all('a', href=True):
            text = anchor.get_text(strip=True)
            matched = [kw for kw in keywords if kw in text]
            if not matched:
                continue
            collected.append({
                'title': text,
                'url': urljoin(base_url, anchor.get('href', '')),
                'source': target['name'],
                'fetch_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'keywords': matched,
            })
        # Second pass: enrich each entry; failures only lose the detail fields.
        for entry in collected:
            try:
                entry.update(self.fetch_article_detail(entry['url'], request_headers))
            except Exception as e:
                self.logger.warning(f"获取详情失败: {entry['url']} - {e}")
    except Exception as e:
        self.logger.error(f"抓取 {target['name']} 失败: {e}")
    return collected
def fetch_article_detail(self, url: str, headers: Dict) -> Dict:
    """Fetch one article page and extract publish date, content excerpt,
    summary and the first attachment URL.

    Returns:
        Dict with keys publish_date/content/summary/file_url; values
        remain empty strings for anything that could not be parsed.
    """
    detail = {'publish_date': '', 'content': '', 'summary': '', 'file_url': ''}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Match dates like 2024-01-02, 2024/1/2 or 2024年1月2日.
        date_pattern = r'(\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?)'
        text_content = soup.get_text()
        date_match = re.search(date_pattern, text_content)
        if date_match:
            # Normalise the CJK date markers to ISO-style separators.
            # BUG FIX: the previous code called str.replace('', '-'),
            # which inserts '-' between every character instead of
            # replacing 年/月/日 (the marker characters had been lost).
            detail['publish_date'] = (
                date_match.group(1)
                .replace('年', '-')
                .replace('月', '-')
                .replace('日', '')
            )
        main_content = soup.find('div', class_=re.compile('content|article|text'))
        if main_content:
            detail['content'] = main_content.get_text(strip=True)[:500]
            detail['summary'] = detail['content'][:200] + '...' if len(detail['content']) > 200 else detail['content']
        file_links = soup.find_all('a', href=re.compile(r'\.(pdf|doc|docx|xls|xlsx|txt)$', re.I))
        if file_links:
            detail['file_url'] = file_links[0].get('href', '')
    except Exception as e:
        self.logger.warning(f"解析详情失败: {url} - {e}")
    return detail
def filter_content(self, articles: List[Dict]) -> List[Dict]:
    """Keep only articles whose title contains at least one policy marker word."""
    markers = ('最新', '通知', '公告', '政策', '法规')
    return [
        item for item in articles
        if any(marker in item.get('title', '') for marker in markers)
    ]
def deduplicate(self, articles: List[Dict]) -> List[Dict]:
    """Deduplicate articles by fuzzy title similarity.

    Two articles are duplicates when their title similarity reaches the
    configured ``title_similarity`` threshold; of a duplicate pair, the
    one with the lexicographically smaller (earlier) publish_date is kept,
    matching the original comparison.

    Fixes two defects of the previous version: the returned list could
    disagree with the internally tracked winner (a replaced article was
    deleted from ``seen`` but the stale one stayed in the result and the
    replacement was never appended), and missing publish dates raised
    ``TypeError`` (``None < None``).
    """
    dedup_config = self.config.get('deduplication', {})
    title_threshold = dedup_config.get('title_similarity', 0.8)
    # title -> chosen article; insertion order preserves first-seen order.
    seen: Dict[str, Dict] = {}
    for article in articles:
        title = article.get('title', '')
        duplicate_of = None
        # Snapshot the keys so replacing an entry is safe.
        for seen_title in list(seen):
            if self.calculate_similarity(title, seen_title) >= title_threshold:
                duplicate_of = seen_title
                break
        if duplicate_of is None:
            seen[title] = article
        elif article.get('publish_date', '') < seen[duplicate_of].get('publish_date', ''):
            # Replace the stored duplicate with the earlier-published one.
            del seen[duplicate_of]
            seen[title] = article
    return list(seen.values())
def calculate_similarity(self, text1: str, text2: str) -> float:
    """Jaccard similarity over the character sets of the two strings.

    Returns a value in [0, 1]; 0.0 when either string is empty.
    """
    if not (text1 and text2):
        return 0.0
    chars_a, chars_b = set(text1), set(text2)
    union_size = len(chars_a | chars_b)
    if union_size == 0:
        return 0.0
    return len(chars_a & chars_b) / union_size
def categorize(self, articles: List[Dict]) -> Dict[str, List[Dict]]:
    """Group articles into configured categories by keyword match.

    Categories are tried in ascending ``priority`` order and the first
    match wins; articles matching nothing land in the fallback bucket.
    Every configured category appears in the result, even when empty.
    """
    categories_config = self.config.get('categories', [])
    # Sort once instead of per article; result is identical.
    ranked = sorted(categories_config, key=lambda c: c.get('priority', 99))
    grouped: Dict[str, List[Dict]] = {cat['name']: [] for cat in categories_config}
    grouped['其他政策'] = []
    for article in articles:
        haystack = article.get('title', '') + ' ' + article.get('content', '')
        bucket = '其他政策'
        for cat in ranked:
            if any(kw in haystack for kw in cat.get('keywords', [])):
                bucket = cat['name']
                break
        grouped[bucket].append(article)
    return grouped
def download_files(self, categorized: Dict[str, List[Dict]]) -> List[Dict]:
    """Download every attachment with an allowed extension into a
    per-category folder.

    Returns:
        The articles that were actually downloaded, each annotated with
        its 'local_path'. Individual download failures are logged and skipped.
    """
    download_config = self.config.get('download', {})
    root = Path(download_config.get('path', './downloads'))
    root.mkdir(parents=True, exist_ok=True)
    allowed_suffixes = tuple(
        f'.{fmt}' for fmt in download_config.get('formats', ['pdf', 'doc', 'docx', 'txt'])
    )
    downloaded: List[Dict] = []
    for category, articles in categorized.items():
        category_dir = root / category
        category_dir.mkdir(exist_ok=True)
        for article in articles:
            file_url = article.get('file_url', '')
            if not file_url or not file_url.lower().endswith(allowed_suffixes):
                continue
            try:
                saved_name = self.download_file(file_url, category_dir)
            except Exception as e:
                self.logger.warning(f"下载失败: {file_url} - {e}")
                continue
            article['local_path'] = str(category_dir / saved_name)
            downloaded.append(article)
    return downloaded
def download_file(self, url: str, save_path: Path) -> str:
    """Stream one file from ``url`` into ``save_path`` and return the
    filename used.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    headers = {'User-Agent': self.config.get('download', {}).get('user_agent', 'Mozilla/5.0')}
    # Use the response as a context manager so the streamed connection is
    # released deterministically; the previous version never closed it.
    with requests.get(url, headers=headers, timeout=60, stream=True) as response:
        response.raise_for_status()
        filename = Path(urlparse(url).path).name
        if not filename:
            # URL has no usable path component (e.g. ends in '/').
            filename = f"document_{int(time.time())}.pdf"
        filepath = save_path / filename
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return filename
def generate_report(self, articles: List[Dict]) -> str:
    """Write today's Excel summary plus a JSON dump of the articles.

    Returns:
        str: path of the Excel report, or "" when there was nothing to report.
    """
    output_dir = self.base_dir / 'output'
    output_dir.mkdir(exist_ok=True)
    today = datetime.now().strftime('%Y%m%d')
    report_file = output_dir / f'summary_{today}.xlsx'
    if not articles:
        self.logger.warning("没有数据生成报告")
        return ""
    rows = [
        {
            '标题': item.get('title', ''),
            '发布时间': item.get('publish_date', ''),
            '来源': item.get('source', ''),
            '类别': self.get_category(item),
            '摘要': item.get('summary', ''),
            '下载链接': item.get('local_path', item.get('file_url', '')),
            '关键词': ', '.join(item.get('keywords', [])),
            '抓取时间': item.get('fetch_time', ''),
        }
        for item in articles
    ]
    pd.DataFrame(rows).to_excel(report_file, index=False, engine='openpyxl')
    json_file = output_dir / f'deduplicated_data_{today}.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
    self.logger.info(f"报告已生成: {report_file}")
    self.logger.info(f"数据已保存: {json_file}")
    return str(report_file)
def get_category(self, article: Dict) -> str:
    """Return the first configured category (lowest priority number) whose
    keywords appear in the article's title or content; fallback otherwise."""
    haystack = article.get('title', '') + ' ' + article.get('content', '')
    ranked = sorted(self.config.get('categories', []), key=lambda c: c.get('priority', 99))
    for cat in ranked:
        if any(kw in haystack for kw in cat.get('keywords', [])):
            return cat['name']
    return '其他政策'
def start_scheduler(self):
    """Run the blocking cron scheduler according to the config.

    Returns immediately when the scheduler is disabled; otherwise blocks
    until KeyboardInterrupt/SystemExit, then shuts the scheduler down.
    """
    scheduler_config = self.config.get('scheduler', {})
    if not scheduler_config.get('enabled', False):
        self.logger.info("定时任务未启用")
        return
    self.scheduler = BlockingScheduler()
    parts = scheduler_config.get('time', '09:00').split(':')
    hour, minute = int(parts[0]), int(parts[1])
    # APScheduler numbers weekdays mon=0 .. sun=6 (unlike classic cron).
    weekday_numbers = {'mon': '0', 'tue': '1', 'wed': '2', 'thu': '3',
                       'fri': '4', 'sat': '5', 'sun': '6'}
    selected_days = scheduler_config.get('days', ['mon', 'tue', 'wed', 'thu', 'fri'])
    day_of_week = ','.join(weekday_numbers.get(d, '0') for d in selected_days)
    trigger = CronTrigger(day_of_week=day_of_week, hour=hour, minute=minute)
    self.scheduler.add_job(self.run, trigger, id='policy_retrieval')
    self.logger.info(f"定时任务已启动,将在每天 {scheduler_config['time']} 执行")
    try:
        self.scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        self.logger.info("定时任务已停止")
        self.scheduler.shutdown()
def init_config(self):
    """Log that the configuration file is ready.

    Note: no file is written here — the config (or its defaults) was
    already loaded in __init__; this only confirms readiness.
    """
    self.logger.info("配置文件已就绪")
def main():
    """CLI entry point: parse arguments and dispatch to the requested command.

    Fixes versus the previous version:
    - 'schedule' no longer raises KeyError when the user config lacks a
      'scheduler' section (keys are created via setdefault).
    - '--disable' now persists the change; it previously returned before
      the config was written, so disabling silently had no effect.
    - The duplicated "暂无报告" branches in 'report' are merged.
    """
    parser = argparse.ArgumentParser(description='政策法规检索与整理系统')
    parser.add_argument('command', choices=['init', 'run', 'schedule', 'report', 'help'],
                        help='命令: init=初始化, run=立即执行, schedule=定时任务, report=查看报告, help=帮助')
    parser.add_argument('--config', '-c', help='配置文件路径')
    parser.add_argument('--time', '-t', help='定时任务时间 (如: 09:00)')
    parser.add_argument('--enable', action='store_true', help='启用定时任务')
    parser.add_argument('--disable', action='store_true', help='禁用定时任务')
    parser.add_argument('--no-email', action='store_true', help='不发送邮件报告')
    parser.add_argument('--email-to', '-e', help='指定收件人邮箱(可多次使用)', action='append')
    args = parser.parse_args()
    system = PolicyRetrievalSystem(config_path=args.config)
    if args.email_to:
        # CLI recipients override both the live object and the config dict,
        # so a config saved by 'schedule' keeps them too.
        system.recipients = args.email_to
        system.config.setdefault('notification', {}).setdefault('email', {})['to_addrs'] = args.email_to
        system.logger.info(f"邮件将发送到: {system.recipients}")
    send_email = not args.no_email
    if args.command == 'init':
        system.init_config()
        print("初始化完成,配置文件: config.yaml")
    elif args.command == 'run':
        try:
            system.run(send_email=send_email)
        except Exception as e:
            error_msg = f"任务执行失败: {str(e)}"
            system.logger.error(error_msg)
            # Best-effort alert; re-raise so the exit code reflects failure.
            if system.notifier.is_enabled() and system.recipients:
                system.notifier.send_error_alert(error_msg, system.recipients)
            raise
    elif args.command == 'schedule':
        scheduler_cfg = system.config.setdefault('scheduler', {})
        scheduler_cfg.setdefault('time', '09:00')
        if args.time:
            scheduler_cfg['time'] = args.time
        if args.enable:
            scheduler_cfg['enabled'] = True
        elif args.disable:
            scheduler_cfg['enabled'] = False
            # Persist before returning so the change survives this run.
            with open(system.config_path, 'w', encoding='utf-8') as f:
                yaml.dump(system.config, f, allow_unicode=True)
            print("定时任务已禁用")
            return
        with open(system.config_path, 'w', encoding='utf-8') as f:
            yaml.dump(system.config, f, allow_unicode=True)
        print(f"定时任务时间: {scheduler_cfg['time']}")
        print("启动定时任务...")
        system.start_scheduler()
    elif args.command == 'report':
        output_dir = Path(__file__).parent / 'output'
        reports = list(output_dir.glob('summary_*.xlsx')) if output_dir.exists() else []
        if reports:
            latest = max(reports, key=lambda p: p.stat().st_mtime)
            print(f"最新报告: {latest}")
            print(pd.read_excel(latest).to_string())
        else:
            print("暂无报告")
    elif args.command == 'help':
        parser.print_help()


if __name__ == '__main__':
    main()