---
# Scheduled task configuration.
# NOTE(review): nesting was reconstructed from the key names because the
# original indentation was lost; confirm against the consuming code.
scheduler:
  enabled: true
  # Quoted so the value stays a string under every YAML version.
  time: "09:00"
  # Weekday abbreviations on which the job is scheduled.
  days:
    - mon
    - tue
    - wed
    - thu
    - fri
  # NOTE(review): max_instances / coalesce look like APScheduler job
  # options; confirm how the consumer passes them on.
  max_instances: 3
  coalesce: true

# Target website configuration.
# Each entry carries a display name, a base URL, the list-page paths, the
# keywords associated with the site, and an enabled flag.
# NOTE(review): nesting was reconstructed from the key names because the
# original indentation was lost; confirm against the consuming code.
targets:
  - name: "国家税务总局"
    url: "https://www.chinatax.gov.cn/"
    list_paths:
      - "/npsite/chinatax/zcwj/"
      - "/npsite/chinatax/tzgg/"
    keywords:
      - "最新"
      - "通知"
      - "公告"
      - "政策"
      - "法规"
    enabled: true

  - name: "财政部"
    url: "https://www.mof.gov.cn/"
    list_paths:
      - "/zhengwugongkai/zhengceku/zhengcefagui/"
    keywords:
      - "最新"
      - "通知"
      - "公告"
      - "政策"
      - "法规"
    enabled: false

  # NOTE(review): this entry points at the same host as the first target
  # but over plain http:// and with a different list path; confirm whether
  # it is an intentional separate source or a leftover duplicate, and
  # whether the scheme should be https:// before enabling it.
  - name: "国家税务局"
    url: "http://www.chinatax.gov.cn/"
    list_paths:
      - "/cloudfw/zcwj/"
    keywords:
      - "最新"
      - "通知"
      - "公告"
      - "政策"
      - "法规"
    enabled: false

# Download configuration.
# NOTE(review): nesting was reconstructed from the key names because the
# original indentation was lost; confirm against the consuming code.
download:
  path: "./downloads"
  # File extensions listed for download.
  formats:
    - pdf
    - doc
    - docx
    - txt
    - xlsx
  # 52428800 = 50 * 1024 * 1024; presumably a size limit in bytes - confirm.
  max_size: 52428800
  # NOTE(review): unit is not stated here; presumably seconds - confirm.
  timeout: 60
  retry: 3
  user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

# Deduplication configuration.
# NOTE(review): the two similarity values are presumably thresholds on a
# 0-1 scale; confirm how the consumer compares against them.
deduplication:
  title_similarity: 0.8
  content_similarity: 0.9
  hash_algorithm: "simhash"

# Classification configuration.
# Each category has a name, a keyword list, and a numeric priority.
# NOTE(review): nesting was reconstructed from the key names because the
# original indentation was lost; confirm against the consuming code.
categories:
  - name: "税收政策"
    keywords:
      - "税收"
      - "税务"
      - "纳税"
      - "税费"
      - "增值税"
      - "所得税"
    priority: 1

  - name: "通知公告"
    keywords:
      - "通知"
      - "公告"
      - "通告"
    priority: 2

  - name: "法规文件"
    keywords:
      - "法规"
      - "条例"
      - "规章"
      - "办法"
      - "细则"
    priority: 3

  # Explicit empty list (not null) for the keyword set.
  # NOTE(review): with no keywords and priority 99 this is presumably the
  # fallback category; confirm in the classifier.
  - name: "其他政策"
    keywords: []
    priority: 99

# Logging configuration.
# NOTE(review): nesting was reconstructed from the key names because the
# original indentation was lost; confirm against the consuming code.
logging:
  level: "INFO"
  # Must stay quoted: a plain scalar cannot start with "%".
  # The placeholders look like Python logging %-style record attributes.
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  file: "./logs/policy_retrieval.log"
  # 10485760 = 10 * 1024 * 1024; presumably the rotation size in bytes -
  # confirm against the log handler setup.
  max_bytes: 10485760
  backup_count: 5