Initial commit
This commit is contained in:
733
backend/app/services/domain.py
Normal file
733
backend/app/services/domain.py
Normal file
@ -0,0 +1,733 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from collections import Counter
|
||||
from datetime import datetime, time, timedelta
|
||||
from typing import Iterable
|
||||
from urllib.parse import unquote, urlparse
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from app.models import (
|
||||
Account,
|
||||
ClsNewsDocument,
|
||||
ClsNewsItem,
|
||||
ClsNewsSummary,
|
||||
ClsSectorImpact,
|
||||
DailyInputAccount,
|
||||
DailyInputDocument,
|
||||
DailyInputUpsertPayload,
|
||||
OpinionArticle,
|
||||
ReportDocument,
|
||||
ReportListItem,
|
||||
)
|
||||
from app.services.storage import (
|
||||
fetch_accounts,
|
||||
fetch_cls_news_document,
|
||||
fetch_daily_input_document,
|
||||
fetch_report_document,
|
||||
fetch_report_list,
|
||||
save_accounts,
|
||||
save_cls_news_document,
|
||||
save_daily_input_document,
|
||||
save_report_document,
|
||||
)
|
||||
|
||||
# Local market timezone; every timestamp produced by this module is Asia/Shanghai.
SHANGHAI = ZoneInfo("Asia/Shanghai")
# Minimum age of a cached CLS news document before it is re-fetched.
CLS_REFRESH_INTERVAL = timedelta(minutes=3)
# Mobile telegraph page scraped for live 7x24 items.
CLS_TELEGRAPH_URL = "https://m.cls.cn/telegraph"
# Browser-like headers so cls.cn serves the regular HTML payload.
HTTP_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

# Sentiment labels: "bullish" / "bearish" / "neutral" (Chinese).
SENTIMENT_BULL = "\u770b\u591a"
SENTIMENT_BEAR = "\u770b\u7a7a"
SENTIMENT_NEUTRAL = "\u4e2d\u6027"

# Built-in roster of tracked opinion accounts; used as a fallback when
# storage returns no accounts (see get_accounts).
ACCOUNTS: list[Account] = [
    Account(
        id="touzi-mingjian",
        name="\u6295\u8d44\u660e\u89c1",
        description="\u504f\u4e3b\u9898\u8f6e\u52a8\u4e0e\u4e3b\u7ebf\u5224\u65ad\uff0c\u9002\u5408\u8ddf\u8e2a\u5e02\u573a\u504f\u597d\u53d8\u5316\u3002",
    ),
    Account(
        id="aigujun-2020",
        name="\u7231\u80a1\u541b2020",
        description="\u5173\u6ce8\u60c5\u7eea\u3001\u70ed\u70b9\u6269\u6563\u4e0e\u4ea4\u6613\u7ec6\u8282\u3002",
    ),
    Account(
        id="mazhiming-shouping",
        name="\u9a6c\u5fd7\u660e\u6536\u8bc4",
        description="\u65e5\u5185\u6536\u8bc4\u4e0e\u60c5\u7eea\u53d8\u5316\u603b\u7ed3\u3002",
    ),
    Account(
        id="laobai-guandian",
        name="\u8001\u767d\u5206\u6790\u5ba4\u89c2\u70b9",
        description="\u504f\u7b56\u7565\u62c6\u89e3\u548c\u5173\u952e\u677f\u5757\u8ddf\u8e2a\u3002",
    ),
]

# Per-account default focus sectors, used by infer_sectors when no keyword matches.
ACCOUNT_FOCUS: dict[str, list[str]] = {
    "touzi-mingjian": ["AI", "\u7b97\u529b", "\u673a\u5668\u4eba"],
    "aigujun-2020": ["CPO", "\u5b58\u50a8\u82af\u7247", "\u65b0\u80fd\u6e90"],
    "mazhiming-shouping": ["AI", "\u5238\u5546", "\u6c7d\u8f66"],
    "laobai-guandian": ["\u673a\u5668\u4eba", "\u534a\u5bfc\u4f53", "\u65b0\u80fd\u6e90"],
}

# Sector -> keyword list driving the naive substring-based sector tagger.
SECTOR_KEYWORDS: dict[str, list[str]] = {
    "AI": ["ai", "\u4eba\u5de5\u667a\u80fd", "\u5927\u6a21\u578b", "\u6a21\u578b"],
    "\u7b97\u529b": ["\u7b97\u529b", "compute", "server", "gpu"],
    "CPO": ["cpo", "\u5149\u6a21\u5757", "\u9ad8\u901f\u4e92\u8054"],
    "\u5b58\u50a8\u82af\u7247": ["\u5b58\u50a8", "memory", "dram", "nand"],
    "\u534a\u5bfc\u4f53": ["\u534a\u5bfc\u4f53", "chip", "wafer", "\u6676\u5706"],
    "\u5238\u5546": ["\u5238\u5546", "broker", "\u8bc1\u5238"],
    "\u77f3\u6cb9\u5929\u7136\u6c14": ["\u77f3\u6cb9", "\u5929\u7136\u6c14", "\u6cb9\u6c14", "\u80fd\u6e90\u4ef7\u683c"],
    "\u65b0\u80fd\u6e90": ["\u65b0\u80fd\u6e90", "\u9502\u7535", "\u5149\u4f0f", "\u50a8\u80fd"],
    "\u519b\u5de5": ["\u519b\u5de5", "\u536b\u661f", "\u822a\u5929"],
    "\u673a\u5668\u4eba": ["\u673a\u5668\u4eba", "robot", "\u81ea\u52a8\u5316"],
    "\u6c7d\u8f66": ["\u6c7d\u8f66", "\u8f66\u4f01", "\u667a\u9a7e", "\u6574\u8f66"],
    "\u533b\u836f": ["\u533b\u836f", "\u521b\u65b0\u836f", "\u533b\u7597"],
}

# Keyword lists for the bullish/bearish hit-count heuristic in infer_sentiment.
POSITIVE_KEYWORDS: list[str] = [
    "\u673a\u4f1a",
    "\u4fee\u590d",
    "\u589e\u5f3a",
    "\u4e3b\u7ebf",
    "\u589e\u91cf",
    "\u53cd\u5f39",
    "\u7a81\u7834",
    "\u79ef\u6781",
    "up",
    "bull",
]
NEGATIVE_KEYWORDS: list[str] = [
    "\u98ce\u9669",
    "\u627f\u538b",
    "\u8c28\u614e",
    "\u56de\u8c03",
    "\u7f29\u91cf",
    "\u89c2\u671b",
    "\u5206\u6b67",
    "bear",
    "down",
]

# (title keyword, article type) pairs checked in order by infer_article_type.
ARTICLE_TYPE_PATTERNS: list[tuple[str, str]] = [
    ("\u6536\u8bc4", "\u5e02\u573a\u6536\u8bc4"),
    ("\u5348", "\u76d8\u4e2d\u89c2\u5bdf"),
    ("\u7b56\u7565", "\u7b56\u7565\u8ddf\u8e2a"),
    ("\u590d\u76d8", "\u76d8\u9762\u590d\u76d8"),
    ("\u884c\u4e1a", "\u884c\u4e1a\u89c2\u5bdf"),
]

# Canned telegraph items used by build_fallback_cls_items when live scraping
# fails or is disabled for a non-current date.
CLS_NEWS_TEMPLATES: list[dict[str, object]] = [
    {
        "title": "\u8d22\u8054\u793e\u76d8\u524d\u7cbe\u9009\uff1a\u7b97\u529b\u94fe\u56de\u6696\uff0c\u8d44\u91d1\u91cd\u65b0\u805a\u7126\u9ad8\u666f\u6c14\u65b9\u5411",
        "summary": "\u9694\u591c\u5e02\u573a\u98ce\u9669\u504f\u597d\u56de\u5347\uff0c\u7b97\u529b\u4e0e\u670d\u52a1\u5668\u94fe\u6761\u83b7\u8d44\u91d1\u91cd\u65b0\u914d\u7f6e\uff0c\u60c5\u7eea\u4fee\u590d\u5148\u4e8e\u6210\u4ea4\u5168\u9762\u653e\u5927\u3002",
        "sectors": ["\u7b97\u529b", "AI"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/compute-rebound",
    },
    {
        "title": "AI Daily\uff1aCPO \u4e0e\u5b58\u50a8\u82af\u7247\u540c\u6b65\u8d70\u5f3a\uff0c\u666f\u6c14\u5ea6\u7ebf\u7d22\u5ef6\u7eed",
        "summary": "\u9ad8\u901f\u4e92\u8054\u4e0e\u5b58\u50a8\u62a5\u4ef7\u9884\u671f\u652f\u6491\u677f\u5757\u8868\u73b0\uff0c\u8d44\u91d1\u66f4\u503e\u5411\u4e8e\u56f4\u7ed5\u786e\u5b9a\u6027\u73af\u8282\u96c6\u4e2d\u3002",
        "sectors": ["CPO", "\u5b58\u50a8\u82af\u7247"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/ai-daily-cpo-memory",
    },
    {
        "title": "\u8d22\u8054\u793e\u884c\u4e1a\u89c2\u5bdf\uff1a\u673a\u5668\u4eba\u94fe\u6761\u5206\u5316\uff0c\u8ba2\u5355\u5151\u73b0\u6210\u4e3a\u77ed\u671f\u7126\u70b9",
        "summary": "\u673a\u5668\u4eba\u65b9\u5411\u5185\u90e8\u5f00\u59cb\u51fa\u73b0\u5151\u73b0\u4e0e\u6362\u624b\uff0c\u5e02\u573a\u4ece\u6982\u5ff5\u6269\u6563\u8f6c\u5411\u4e1a\u7ee9\u4e0e\u8ba2\u5355\u9a8c\u8bc1\u3002",
        "sectors": ["\u673a\u5668\u4eba"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/robotics-orders",
    },
    {
        "title": "\u8d22\u8054\u793e\u80fd\u6e90\u8ffd\u8e2a\uff1a\u6cb9\u6c14\u677f\u5757\u9ad8\u4f4d\u9707\u8361\uff0c\u8d44\u91d1\u5207\u5411\u9632\u5fa1\u54c1\u79cd",
        "summary": "\u539f\u6cb9\u4ef7\u683c\u7ef4\u6301\u9ad8\u4f4d\u540e\uff0c\u6cb9\u6c14\u65b9\u5411\u51fa\u73b0\u9ad8\u4f4d\u9707\u8361\uff0c\u90e8\u5206\u8d44\u91d1\u8f6c\u5411\u533b\u836f\u7b49\u9632\u5b88\u677f\u5757\u3002",
        "sectors": ["\u77f3\u6cb9\u5929\u7136\u6c14", "\u533b\u836f"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/energy-rotation",
    },
    {
        "title": "AI Daily\uff1a\u6c7d\u8f66\u4e0e\u667a\u9a7e\u5ef6\u7eed\u5206\u6b67\uff0c\u4e3b\u7ebf\u4ecd\u9700\u7b49\u5f85\u9500\u91cf\u6570\u636e\u9a8c\u8bc1",
        "summary": "\u6574\u8f66\u4e0e\u667a\u9a7e\u65b9\u5411\u5173\u6ce8\u5ea6\u4ecd\u9ad8\uff0c\u4f46\u5e02\u573a\u5bf9\u4f30\u503c\u6269\u5f20\u5df2\u6709\u4fdd\u7559\uff0c\u7b49\u5f85\u9500\u91cf\u548c\u8ba2\u5355\u6570\u636e\u786e\u8ba4\u3002",
        "sectors": ["\u6c7d\u8f66"],
        "sentiment": SENTIMENT_BEAR,
        "reference_url": "https://www.cls.cn/detail/auto-data-watch",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u534a\u5bfc\u4f53\u8bbe\u5907\u65b9\u5411\u8d70\u5f3a\uff0c\u673a\u6784\u79f0\u56fd\u4ea7\u66ff\u4ee3\u8282\u594f\u63d0\u901f",
        "summary": "\u6676\u5706\u5236\u9020\u4e0e\u8bbe\u5907\u94fe\u6761\u51fa\u73b0\u5f02\u52a8\uff0c\u5e02\u573a\u56f4\u7ed5\u56fd\u4ea7\u66ff\u4ee3\u548c\u8d44\u672c\u5f00\u652f\u6062\u590d\u91cd\u65b0\u5b9a\u4ef7\u3002",
        "sectors": ["\u534a\u5bfc\u4f53"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/semi-equipment-up",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u5238\u5546\u677f\u5757\u5348\u540e\u62c9\u5347\uff0c\u5e02\u573a\u60c5\u7eea\u6709\u6240\u4fee\u590d",
        "summary": "\u6307\u6570\u9707\u8361\u8fc7\u7a0b\u4e2d\u5238\u5546\u627f\u62c5\u60c5\u7eea\u4fee\u590d\u529f\u80fd\uff0c\u5e26\u52a8\u90e8\u5206\u9ad8\u5f39\u6027\u65b9\u5411\u56de\u6696\u3002",
        "sectors": ["\u5238\u5546"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/broker-rebound",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u521b\u65b0\u836f\u65b9\u5411\u6301\u7eed\u6d3b\u8dc3\uff0c\u8d44\u91d1\u8f6c\u5411\u9632\u5b88\u4e0e\u6210\u957f\u517c\u987e",
        "summary": "\u533b\u836f\u677f\u5757\u83b7\u5f97\u589e\u91cf\u8d44\u91d1\u5173\u6ce8\uff0c\u521b\u65b0\u836f\u548c\u5668\u68b0\u7ec6\u5206\u8868\u73b0\u66f4\u5f3a\u3002",
        "sectors": ["\u533b\u836f"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/medical-active",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u65b0\u80fd\u6e90\u94fe\u6761\u5206\u5316\u52a0\u5267\uff0c\u673a\u6784\u63d0\u9192\u5173\u6ce8\u4ea7\u80fd\u51fa\u6e05\u8282\u594f",
        "summary": "\u65b0\u80fd\u6e90\u677f\u5757\u5185\u90e8\u8f6e\u52a8\u660e\u663e\uff0c\u8d44\u91d1\u66f4\u504f\u5411\u4f4e\u4f4d\u73af\u8282\u548c\u6210\u672c\u6539\u5584\u65b9\u5411\u3002",
        "sectors": ["\u65b0\u80fd\u6e90"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/new-energy-split",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u519b\u5de5\u677f\u5757\u76d8\u4e2d\u5f02\u52a8\uff0c\u8ba2\u5355\u5151\u73b0\u9884\u671f\u91cd\u65b0\u5347\u6e29",
        "summary": "\u519b\u5de5\u94fe\u6761\u76d8\u4e2d\u8d70\u5f3a\uff0c\u5e02\u573a\u5173\u6ce8\u540e\u7eed\u8ba2\u5355\u5151\u73b0\u4e0e\u4f30\u503c\u5207\u6362\u7a7a\u95f4\u3002",
        "sectors": ["\u519b\u5de5"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/defense-orders",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u673a\u5668\u4eba\u677f\u5757\u51b2\u9ad8\u56de\u843d\uff0c\u77ed\u7ebf\u535a\u5f08\u60c5\u7eea\u5347\u6e29",
        "summary": "\u673a\u5668\u4eba\u65b9\u5411\u9ad8\u4f4d\u9707\u8361\uff0c\u8d44\u91d1\u5728\u9898\u6750\u6269\u6563\u4e0e\u5151\u73b0\u538b\u529b\u4e4b\u95f4\u53cd\u590d\u5207\u6362\u3002",
        "sectors": ["\u673a\u5668\u4eba"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/robotics-intraday",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u5b58\u50a8\u82af\u7247\u62a5\u4ef7\u9884\u671f\u7ee7\u7eed\u4e0a\u4fee\uff0c\u4ea7\u4e1a\u94fe\u666f\u6c14\u5ea6\u53d7\u5173\u6ce8",
        "summary": "\u5b58\u50a8\u73af\u8282\u4ef7\u683c\u4fee\u590d\u903b\u8f91\u5ef6\u7eed\uff0c\u5e02\u573a\u91cd\u65b0\u4ea4\u6613\u4f9b\u9700\u6539\u5584\u4e0e\u76c8\u5229\u5f39\u6027\u3002",
        "sectors": ["\u5b58\u50a8\u82af\u7247"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/memory-price-up",
    },
]

# Demo daily-input links, keyed by day offset back from today (see seed_demo_content).
SAMPLE_INPUTS: dict[int, dict[str, list[str]]] = {
    1: {
        "touzi-mingjian": ["https://mp.weixin.qq.com/s/semiconductor-capacity-and-chip-cycle"],
        "aigujun-2020": ["https://mp.weixin.qq.com/s/storage-chip-price-repair"],
        "mazhiming-shouping": ["https://mp.weixin.qq.com/s/market-close-sector-rotation"],
        "laobai-guandian": ["https://mp.weixin.qq.com/s/robotics-and-energy-balance"],
    },
}
|
||||
|
||||
|
||||
def now_local() -> datetime:
    """Return the current wall-clock time in the Asia/Shanghai timezone."""
    return datetime.now(SHANGHAI)
|
||||
|
||||
|
||||
def iso_timestamp(value: datetime | None = None) -> str:
|
||||
return (value or now_local()).replace(microsecond=0).isoformat()
|
||||
|
||||
|
||||
def ensure_local_timezone(value: datetime) -> datetime:
    """Coerce *value* into Asia/Shanghai, attaching the zone when naive."""
    if value.tzinfo is not None:
        # Aware datetime: convert into the local zone.
        return value.astimezone(SHANGHAI)
    # Naive datetime: interpret it as already being local time.
    return value.replace(tzinfo=SHANGHAI)
|
||||
|
||||
|
||||
def normalize_whitespace(value: str) -> str:
    """Trim *value* and collapse each internal whitespace run to one space."""
    return re.sub(r"\s+", " ", value.strip())
|
||||
|
||||
|
||||
def extract_json_object(script_text: str, marker: str) -> str:
    """Return the first balanced ``{...}`` object that follows *marker*.

    Scans character by character, tracking string literals and escape
    sequences so braces inside JSON strings do not affect the nesting count.

    Raises:
        RuntimeError: if the marker, the opening brace, or the matching
            closing brace cannot be found.
    """
    anchor = script_text.find(marker)
    if anchor < 0:
        raise RuntimeError(f"Marker not found: {marker}")

    begin = script_text.find("{", anchor)
    if begin < 0:
        raise RuntimeError(f"JSON object start not found for marker: {marker}")

    nesting = 0
    inside_string = False
    pending_escape = False
    for position, char in enumerate(script_text[begin:], start=begin):
        if inside_string:
            if pending_escape:
                # Previous char was a backslash: this char is escaped.
                pending_escape = False
            elif char == "\\":
                pending_escape = True
            elif char == '"':
                inside_string = False
        elif char == '"':
            inside_string = True
        elif char == "{":
            nesting += 1
        elif char == "}":
            nesting -= 1
            if nesting == 0:
                return script_text[begin : position + 1]

    raise RuntimeError(f"JSON object end not found for marker: {marker}")
|
||||
|
||||
|
||||
def parse_telegraph_timestamp(date_str: str, time_str: str) -> str:
    """Combine a date and a (possibly seconds-less) time into a local ISO stamp."""
    # Pad to hh:mm:ss unless the time already has all three components.
    if len(time_str.split(":")) != 3:
        time_str = f"{time_str}:00"
    combined = datetime.fromisoformat(f"{date_str}T{time_str}")
    return combined.replace(tzinfo=SHANGHAI).isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def split_title_and_summary(content: str) -> tuple[str, str]:
    """Derive a (title, summary) pair from raw telegraph content.

    A leading bracketed segment (ASCII or CJK brackets) becomes the title;
    otherwise the first sentence does. The summary is the cleaned text,
    truncated to 220 characters with an ellipsis when longer.
    """

    def squash(text: str) -> str:
        # Inline whitespace normalization, same contract as normalize_whitespace.
        return re.sub(r"\s+", " ", text).strip()

    flattened = squash(content)
    framed = re.match(r"^[\[({\u3010\u3016](.+?)[\])}\u3011\u3017][\uff1a: ]*(.*)$", flattened)
    if framed:
        heading = squash(framed.group(1))
        body = squash(framed.group(2) or flattened)
        # Fall back to the heading itself when nothing follows the brackets.
        return heading[:80], body or heading

    first_sentence = re.split(r"[。;;!?!?]", flattened, maxsplit=1)[0]
    trimmed = flattened if len(flattened) <= 220 else f"{flattened[:217]}..."
    return first_sentence[:80], trimmed
|
||||
|
||||
|
||||
def build_fallback_cls_items(reference_time: datetime) -> list[ClsNewsItem]:
    """Materialize the static news templates as items staggered before *reference_time*."""
    fallback: list[ClsNewsItem] = []
    for position, template in enumerate(CLS_NEWS_TEMPLATES):
        # Spread items backwards in time (95-minute steps) so they look organic.
        stamp = reference_time - timedelta(minutes=position * 95 + 8)
        fallback.append(
            ClsNewsItem(
                id=f"cls-{position + 1}",
                title=template["title"],
                published_at=stamp.replace(microsecond=0).isoformat(),
                source="\u8d22\u8054\u793e" if position % 2 == 0 else "\u8d22\u8054\u793e AI Daily",
                summary=template["summary"],
                reference_url=template["reference_url"],
                sectors=template["sectors"],
                sentiment=template["sentiment"],
            )
        )
    fallback.sort(key=lambda entry: entry.published_at, reverse=True)
    return fallback
|
||||
|
||||
|
||||
def fetch_cls_telegraph_items(reference_time: datetime) -> list[ClsNewsItem]:
    """Scrape the live cls.cn telegraph page for today's news items.

    Parses the ``__NEXT_DATA__`` JSON embedded in the page, keeps only
    entries published on ``reference_time.date()``, and returns them
    newest-first. Raises RuntimeError when the payload is missing or no
    items can be parsed; callers fall back to canned templates.
    """
    session = requests.Session()
    # Ignore proxy/env settings so the request shape is deterministic.
    session.trust_env = False
    response = session.get(CLS_TELEGRAPH_URL, headers=HTTP_HEADERS, timeout=15)
    response.raise_for_status()
    # Force UTF-8 before .text is decoded (page headers may omit charset).
    response.encoding = "utf-8"

    soup = BeautifulSoup(response.text, "html.parser")
    next_data_script = None
    for script in soup.find_all("script"):
        script_text = script.string or script.get_text()
        if "__NEXT_DATA__ =" in script_text:
            next_data_script = script_text
            break
    if not next_data_script:
        raise RuntimeError("Missing __NEXT_DATA__ payload on cls.cn")

    # Pull the balanced JSON object assigned to __NEXT_DATA__.
    next_data = json.loads(extract_json_object(next_data_script, "__NEXT_DATA__ ="))
    roll_data = (
        next_data.get("props", {})
        .get("initialState", {})
        .get("roll_data", [])
    )
    if not isinstance(roll_data, list) or not roll_data:
        raise RuntimeError("Missing roll_data in cls.cn payload")

    target_date = reference_time.date()
    items: list[ClsNewsItem] = []
    seen_ids: set[int] = set()
    # Cap the number of items kept per refresh.
    latest_limit = 80
    for entry in roll_data:
        if len(items) >= latest_limit:
            break

        item_id = int(entry.get("id") or 0)
        if not item_id or item_id in seen_ids:
            continue
        seen_ids.add(item_id)

        # Prefer the modification time; fall back to creation time.
        timestamp = int(entry.get("modified_time") or entry.get("ctime") or 0)
        if not timestamp:
            continue
        published_dt = datetime.fromtimestamp(timestamp, tz=SHANGHAI)
        # Only keep items published on the requested date.
        if published_dt.date() != target_date:
            continue

        raw_content = normalize_whitespace(
            entry.get("content")
            or entry.get("brief")
            or entry.get("title")
            or ""
        )
        # Skip near-empty stubs.
        if len(raw_content) < 8:
            continue

        title = normalize_whitespace(entry.get("title") or "")
        if not title:
            title, _ = split_title_and_summary(raw_content)

        summary = normalize_whitespace(entry.get("brief") or "")
        if not summary:
            _, summary = split_title_and_summary(raw_content)

        source = normalize_whitespace(entry.get("author") or "\u8d22\u8054\u793e7x24")
        reference_url = normalize_whitespace(entry.get("shareurl") or "")
        if not reference_url:
            # Synthesize a detail URL from the item id when no share URL exists.
            reference_url = f"https://www.cls.cn/detail/{item_id}"

        # Sector/sentiment tagging reuses the same heuristics as report generation.
        sectors = infer_sectors(f"{title} {summary}", "touzi-mingjian")
        sentiment = infer_sentiment(f"{title} {summary}")
        items.append(
            ClsNewsItem(
                id=f"cls-live-{item_id}",
                title=title[:120],
                published_at=published_dt.isoformat(timespec="seconds"),
                source=source,
                summary=summary[:500],
                reference_url=reference_url,
                sectors=sectors,
                sentiment=sentiment,
            )
        )

    if not items:
        raise RuntimeError("No telegraph items parsed from cls.cn")

    # Newest first (ISO timestamps sort lexicographically).
    return sorted(items, key=lambda item: item.published_at, reverse=True)
|
||||
|
||||
|
||||
def get_accounts() -> list[Account]:
    """Return accounts from storage, falling back to the built-in roster."""
    return fetch_accounts() or ACCOUNTS
|
||||
|
||||
|
||||
def normalize_date(value: str) -> str:
    """Reduce an ISO date or datetime string to its ``YYYY-MM-DD`` date part."""
    parsed = datetime.fromisoformat(value)
    return parsed.date().isoformat()
|
||||
|
||||
|
||||
def blank_daily_input(date_str: str) -> DailyInputDocument:
    """Build an empty per-account input document for *date_str*."""
    rows = [
        DailyInputAccount(account_id=entry.id, account_name=entry.name, links=[])
        for entry in get_accounts()
    ]
    return DailyInputDocument(
        date=date_str,
        updated_at=iso_timestamp(),
        accounts=rows,
    )
|
||||
|
||||
|
||||
def clean_links(links: Iterable[str]) -> list[str]:
    """Strip, drop empties, and de-duplicate links preserving first-seen order."""
    stripped = (raw.strip() for raw in links)
    # dict.fromkeys keeps insertion order, so duplicates collapse to the first hit.
    return [link for link in dict.fromkeys(stripped) if link]
|
||||
|
||||
|
||||
def normalize_daily_input(date_str: str, payload: DailyInputUpsertPayload) -> DailyInputDocument:
    """Canonicalize an upsert payload into a full per-account input document.

    Every known account appears in the result; accounts missing from the
    payload get an empty link list, and submitted links are cleaned.
    """
    links_by_account = {entry.account_id: clean_links(entry.links) for entry in payload.accounts}
    rows = [
        DailyInputAccount(
            account_id=account.id,
            account_name=account.name,
            links=links_by_account.get(account.id, []),
        )
        for account in get_accounts()
    ]
    return DailyInputDocument(
        date=date_str,
        updated_at=iso_timestamp(),
        accounts=rows,
    )
|
||||
|
||||
|
||||
def load_daily_input(date_str: str) -> DailyInputDocument:
    """Fetch the stored daily input for *date_str*, or a blank skeleton if absent."""
    payload = fetch_daily_input_document(date_str)
    if payload is None:
        return blank_daily_input(date_str)
    return payload


def save_daily_input(document: DailyInputDocument) -> DailyInputDocument:
    """Persist *document* via the storage layer and return the saved copy."""
    return save_daily_input_document(document)


def load_report(date_str: str) -> ReportDocument | None:
    """Fetch the generated report for *date_str*, if one exists."""
    return fetch_report_document(date_str)


def save_report(document: ReportDocument) -> ReportDocument:
    """Persist *document* via the storage layer and return the saved copy."""
    return save_report_document(document)


def list_reports() -> list[ReportListItem]:
    """Return the stored report index entries."""
    return fetch_report_list()
|
||||
|
||||
|
||||
def title_from_link(account_name: str, url: str, index: int) -> str:
    """Synthesize an article title from a URL's path tokens.

    Splits the decoded path into word tokens, drops boilerplate hostname
    parts, and joins up to three keywords. Falls back to a numbered
    generic title when no meaningful token survives.
    """
    decoded = unquote(urlparse(url).path or url)
    boilerplate = {"s", "mp", "weixin", "qq", "com"}
    tokens = [
        piece
        for piece in re.split(r"[\W_]+", decoded.lower())
        if piece and piece not in boilerplate
    ]
    keywords = [piece for piece in tokens if len(piece) > 1]
    if not keywords:
        return f"{account_name}\uff1a\u5e02\u573a\u8ddf\u8e2a\u7b2c {index + 1} \u6761"
    # Short tokens are likely acronyms; upper-case them, title-case the rest.
    styled = [piece.upper() if len(piece) <= 3 else piece.capitalize() for piece in keywords[:3]]
    topic = " / ".join(styled)
    return f"{account_name}\uff1a{topic} \u89c2\u5bdf"
|
||||
|
||||
|
||||
def infer_sectors(text: str, account_id: str) -> list[str]:
    """Tag up to three sectors whose keywords appear in *text*.

    When nothing matches, fall back to (at most two of) the account's
    configured focus sectors.
    """
    haystack = text.lower()
    matched: list[str] = []
    for sector, keywords in SECTOR_KEYWORDS.items():
        if any(term.lower() in haystack for term in keywords):
            matched.append(sector)
    if matched:
        return matched[:3]
    return ACCOUNT_FOCUS.get(account_id, ["AI", "\u7b97\u529b"])[:2]
|
||||
|
||||
|
||||
def infer_sentiment(text: str) -> str:
    """Classify *text* by counting bullish versus bearish keyword hits."""
    haystack = text.lower()
    bullish = sum(1 for term in POSITIVE_KEYWORDS if term.lower() in haystack)
    bearish = sum(1 for term in NEGATIVE_KEYWORDS if term.lower() in haystack)
    if bullish == bearish:
        # Ties (including zero hits on both sides) read as neutral.
        return SENTIMENT_NEUTRAL
    return SENTIMENT_BULL if bullish > bearish else SENTIMENT_BEAR
|
||||
|
||||
|
||||
def infer_article_type(title: str) -> str:
    """Map a title to an article type via the keyword patterns, in order."""
    haystack = title.lower()
    return next(
        (kind for keyword, kind in ARTICLE_TYPE_PATTERNS if keyword.lower() in haystack),
        "\u4e3b\u9898\u89c2\u70b9",
    )
|
||||
|
||||
|
||||
def build_article_summary(title: str, sectors: list[str], sentiment: str) -> str:
    """Compose a one-line summary from the title, top sectors and sentiment."""
    tone_by_sentiment = {
        SENTIMENT_BULL: "\u504f\u79ef\u6781\u7684\u8282\u594f\u5224\u65ad",
        SENTIMENT_BEAR: "\u660e\u663e\u504f\u8c28\u614e\u7684\u98ce\u9669\u63d0\u9192",
        SENTIMENT_NEUTRAL: "\u66f4\u5f3a\u8c03\u7ed3\u6784\u5206\u5316\u4e0e\u7b49\u5f85\u786e\u8ba4",
    }
    sentiment_text = tone_by_sentiment[sentiment]
    sector_text = "\u3001".join(sectors[:2]) if sectors else "\u6838\u5fc3\u4e3b\u7ebf"
    return f"{title} \u56f4\u7ed5 {sector_text} \u5c55\u5f00\uff0c\u7ed9\u51fa\u7684\u7ed3\u8bba\u662f{sentiment_text}\uff0c\u9002\u5408\u4f5c\u4e3a\u5f53\u65e5\u76d8\u9762\u8ddf\u8e2a\u4e0e\u590d\u76d8\u53c2\u8003\u3002"
|
||||
|
||||
|
||||
def generate_report(date_str: str, input_document: DailyInputDocument) -> ReportDocument:
    """Build the daily opinion report from the submitted article links.

    For every link, a title, sector list and sentiment are inferred
    heuristically from the URL text, and a deterministic intraday
    publish time is synthesized. Returns an empty-shell report when no
    links were entered for the day.
    """
    base_date = datetime.fromisoformat(date_str)
    articles: list[OpinionArticle] = []
    for account_index, account in enumerate(input_document.accounts):
        for link_index, url in enumerate(account.links):
            title = title_from_link(account.account_name, url, link_index)
            sectors = infer_sectors(f"{title} {url}", account.account_id)
            sentiment = infer_sentiment(f"{title} {url}")
            # Deterministic pseudo publish time: hour in 09..16, minute in 12-step bins.
            published_at = (
                base_date.replace(hour=9 + ((account_index + link_index) % 8), minute=(link_index * 12) % 60)
                .replace(tzinfo=SHANGHAI)
                .isoformat(timespec="seconds")
            )
            articles.append(
                OpinionArticle(
                    id=f"{date_str}-{account.account_id}-{link_index}",
                    account_id=account.account_id,
                    account_name=account.account_name,
                    title=title,
                    published_at=published_at,
                    summary=build_article_summary(title, sectors, sentiment),
                    source_url=url,
                    sectors=sectors,
                    sentiment=sentiment,
                    article_type=infer_article_type(title),
                )
            )

    if not articles:
        # No links entered yet: keep the report structure but mark it empty.
        return ReportDocument(
            date=date_str,
            generated_at=iso_timestamp(),
            summary="\u5f53\u65e5\u5c1a\u672a\u5f55\u5165\u6587\u7ae0\u94fe\u63a5\uff0c\u7cfb\u7edf\u5df2\u4fdd\u7559\u65e5\u62a5\u7ed3\u6784\uff0c\u7b49\u5f85\u8865\u5145\u516c\u4f17\u53f7\u6587\u7ae0\u540e\u518d\u751f\u6210\u5b8c\u6574\u7ed3\u8bba\u3002",
            focus_sectors=[],
            article_count=0,
            account_count=0,
            articles=[],
        )

    # Top four sectors by mention count become the report focus.
    sector_counter = Counter(sector for article in articles for sector in article.sectors)
    focus_sectors = [sector for sector, _count in sector_counter.most_common(4)]

    # Majority sentiment decides the overall tone line.
    sentiment_counter = Counter(article.sentiment for article in articles)
    if sentiment_counter[SENTIMENT_BULL] > sentiment_counter[SENTIMENT_BEAR]:
        tone = "\u6574\u4f53\u504f\u79ef\u6781\uff0c\u4e3b\u7ebf\u8ba8\u8bba\u96c6\u4e2d\u5ea6\u8f83\u9ad8"
    elif sentiment_counter[SENTIMENT_BEAR] > sentiment_counter[SENTIMENT_BULL]:
        tone = "\u6574\u4f53\u504f\u8c28\u614e\uff0c\u98ce\u9669\u63a7\u5236\u4ecd\u662f\u4e3b\u53d9\u4e8b"
    else:
        tone = "\u591a\u7a7a\u5206\u6b67\u5e76\u5b58\uff0c\u5e02\u573a\u66f4\u770b\u91cd\u9a8c\u8bc1\u4e0e\u8282\u594f"

    # Only accounts that actually submitted links count as active.
    active_accounts = len([account for account in input_document.accounts if account.links])
    sector_text = "\u3001".join(focus_sectors) if focus_sectors else "\u6682\u65e0\u805a\u7126\u677f\u5757"
    summary = (
        f"{date_str} \u5171\u6574\u7406 {len(articles)} \u7bc7\u516c\u4f17\u53f7\u89c2\u70b9\uff0c\u8986\u76d6 {active_accounts} \u4e2a\u8d26\u6237\u3002"
        f"{tone}\uff0c\u8ba8\u8bba\u91cd\u70b9\u843d\u5728 {sector_text}\u3002"
    )

    return ReportDocument(
        date=date_str,
        generated_at=iso_timestamp(),
        summary=summary,
        focus_sectors=focus_sectors,
        article_count=len(articles),
        account_count=active_accounts,
        articles=sorted(articles, key=lambda item: item.published_at, reverse=True),
    )
|
||||
|
||||
|
||||
def build_cls_news_document(
    reference_time: datetime | None = None,
    *,
    allow_live_fetch: bool = True,
) -> ClsNewsDocument:
    """Assemble a CLS news document for the day of *reference_time*.

    Tries the live cls.cn scrape first (when allowed); any failure —
    including an explicitly disabled live fetch — falls back to the
    canned templates. Aggregates sector counts and per-sector sentiment
    into the document summary.
    """
    current = reference_time or now_local()
    try:
        if allow_live_fetch:
            items = fetch_cls_telegraph_items(current)
        else:
            # Raise into the same fallback path used for scrape failures.
            raise RuntimeError("Live fetch disabled for non-current date")
    except Exception:
        items = build_fallback_cls_items(current)

    # Top five sectors by mention count form the watch list.
    sector_counter = Counter(sector for item in items for sector in item.sectors)
    watch_list = [sector for sector, _count in sector_counter.most_common(5)]

    overview = (
        "\u8d44\u8baf\u5217\u8868\u5c55\u793a\u6240\u9009\u65e5\u671f\u5185\u7684\u8d22\u8054\u793e 7x24 \u8d44\u8baf\uff0c"
        "\u5f53\u65e5\u6570\u636e\u6765\u81ea cls.cn \u5b9e\u65f6\u6293\u53d6\uff0c\u6bcf 3 \u5206\u949f\u66f4\u65b0\u4e00\u6b21\u3002"
    )
    hot_topics = (
        "\u70ed\u70b9\u6982\u89c8\u53ea\u4fdd\u7559\u5bf9\u677f\u5757\u5b58\u5728\u660e\u663e\u5f71\u54cd\u7684\u65b9\u5411\uff0c"
        f"\u5f53\u524d\u4e3b\u8981\u96c6\u4e2d\u5728 {'\u3001'.join(watch_list[:3])}\u3002"
    )

    # Build a sentiment verdict for each of the top four sectors.
    sector_impacts: list[ClsSectorImpact] = []
    seen_sectors: set[str] = set()
    for sector in watch_list[:4]:
        if sector in seen_sectors:
            continue
        seen_sectors.add(sector)
        related_items = [item for item in items if sector in item.sectors]
        if not related_items:
            continue

        # Majority sentiment among the sector's items decides the verdict.
        sentiment_counter = Counter(item.sentiment for item in related_items)
        if sentiment_counter[SENTIMENT_BULL] > sentiment_counter[SENTIMENT_BEAR]:
            sentiment = SENTIMENT_BULL
            reason = f"{sector} \u65b9\u5411\u51fa\u73b0\u50ac\u5316\u6216\u666f\u6c14\u5f3a\u5316\uff0c\u77ed\u7ebf\u504f\u6b63\u5411\u5f71\u54cd\u3002"
        elif sentiment_counter[SENTIMENT_BEAR] > sentiment_counter[SENTIMENT_BULL]:
            sentiment = SENTIMENT_BEAR
            reason = f"{sector} \u65b9\u5411\u51fa\u73b0\u5151\u73b0\u6216\u5206\u6b67\uff0c\u77ed\u7ebf\u504f\u8d1f\u5411\u5f71\u54cd\u3002"
        else:
            sentiment = SENTIMENT_NEUTRAL
            reason = f"{sector} \u65b9\u5411\u6709\u8ba8\u8bba\u4f46\u4ecd\u9700\u9a8c\u8bc1\uff0c\u77ed\u7ebf\u4ee5\u4e2d\u6027\u89c2\u5bdf\u4e3a\u4e3b\u3002"

        sector_impacts.append(
            ClsSectorImpact(
                sector=sector,
                sentiment=sentiment,
                reason=reason,
                # De-duplicate titles while preserving order.
                related_titles=list(dict.fromkeys(item.title for item in related_items[:2])),
            )
        )

    return ClsNewsDocument(
        date=current.date().isoformat(),
        updated_at=iso_timestamp(current),
        window_label="\u5f53\u5929\u8d44\u8baf",
        summary=ClsNewsSummary(
            overview=overview,
            hot_topics=hot_topics,
            watch_list=watch_list,
        ),
        sector_impacts=sector_impacts,
        items=items,
    )
|
||||
|
||||
|
||||
def load_cls_news(date_str: str) -> ClsNewsDocument | None:
    """Fetch the stored CLS news document for *date_str*, if one exists."""
    return fetch_cls_news_document(date_str)
|
||||
|
||||
|
||||
def build_reference_time(date_str: str) -> datetime:
    """Pick the reference time for a date: live clock today, 15:00 otherwise.

    Historical dates are anchored at the market close (15:00 local) so the
    fallback news items get plausible intraday timestamps.
    """
    requested = datetime.fromisoformat(date_str).date()
    if requested == now_local().date():
        return now_local()
    return datetime.combine(requested, time(hour=15, minute=0), tzinfo=SHANGHAI)
|
||||
|
||||
|
||||
def refresh_cls_news(date_str: str | None = None) -> ClsNewsDocument:
    """Rebuild and persist the CLS news document for *date_str* (default today).

    Live fetching is only attempted for the current date. If rebuilding
    fails but a cached document exists, the cached copy is returned
    instead of propagating the error.
    """
    normalized_date = normalize_date(date_str or now_local().date().isoformat())
    existing = load_cls_news(normalized_date)
    reference_time = build_reference_time(normalized_date)
    # Only today's document may hit the live cls.cn scrape.
    allow_live_fetch = normalized_date == now_local().date().isoformat()
    try:
        document = build_cls_news_document(reference_time, allow_live_fetch=allow_live_fetch)
    except Exception:
        # Prefer serving stale data over failing the request.
        if existing is not None:
            return existing
        raise
    return save_cls_news_document(document)
|
||||
|
||||
|
||||
def get_cls_news(date_str: str | None = None) -> ClsNewsDocument:
    """Return the CLS news document for *date_str*, refreshing when stale.

    Historical documents are served as-is; today's document is refreshed
    once it is older than CLS_REFRESH_INTERVAL (3 minutes).
    """
    normalized_date = normalize_date(date_str or now_local().date().isoformat())
    document = load_cls_news(normalized_date)
    if document is None:
        return refresh_cls_news(normalized_date)
    # Past dates never go stale — only today's cache expires.
    if normalized_date != now_local().date().isoformat():
        return document
    updated_at = ensure_local_timezone(datetime.fromisoformat(document.updated_at))
    if now_local() - updated_at >= CLS_REFRESH_INTERVAL:
        return refresh_cls_news(normalized_date)
    return document
|
||||
|
||||
|
||||
def seed_demo_content() -> None:
    """Seed storage with demo accounts, sample daily inputs and reports.

    Idempotent-ish: sample days are skipped when both their input and
    report already exist, and the CLS news document is only created when
    missing for today.
    """
    save_accounts(ACCOUNTS)

    today = now_local().date()
    # SAMPLE_INPUTS is keyed by day offset back from today.
    for offset, account_links in SAMPLE_INPUTS.items():
        date_str = (today - timedelta(days=offset)).isoformat()
        # Skip days that are already fully seeded (input AND report present).
        if fetch_daily_input_document(date_str) is not None and fetch_report_document(date_str) is not None:
            continue

        payload = DailyInputUpsertPayload(
            accounts=[
                {"account_id": account.id, "links": account_links.get(account.id, [])}
                for account in ACCOUNTS
            ]
        )
        input_document = normalize_daily_input(date_str, payload)
        save_daily_input_document(input_document)
        save_report_document(generate_report(date_str, input_document))

    today_str = today.isoformat()
    if fetch_cls_news_document(today_str) is None:
        save_cls_news_document(build_cls_news_document())
|
||||
Reference in New Issue
Block a user