"""Demo content service: daily opinion-article reports plus CLS 7x24 telegraph news.

Responsibilities:
- Maintain the fixed demo account list and per-day article-link input documents.
- Generate a daily ``ReportDocument`` from the links (heuristic title/sector/
  sentiment inference — no LLM involved).
- Fetch and cache the cls.cn mobile telegraph feed, falling back to canned
  templates when the live fetch fails or is disallowed for past dates.

All timestamps are Asia/Shanghai; persistence goes through app.services.storage.
"""

from __future__ import annotations

import json
import re
from collections import Counter
from datetime import datetime, time, timedelta
from typing import Iterable
from urllib.parse import unquote, urlparse
from zoneinfo import ZoneInfo

import requests
from bs4 import BeautifulSoup

from app.models import (
    Account,
    ClsNewsDocument,
    ClsNewsItem,
    ClsNewsSummary,
    ClsSectorImpact,
    DailyInputAccount,
    DailyInputDocument,
    DailyInputUpsertPayload,
    OpinionArticle,
    ReportDocument,
    ReportListItem,
)
from app.services.storage import (
    fetch_accounts,
    fetch_cls_news_document,
    fetch_daily_input_document,
    fetch_report_document,
    fetch_report_list,
    save_accounts,
    save_cls_news_document,
    save_daily_input_document,
    save_report_document,
)

# All wall-clock logic in this module is pinned to Asia/Shanghai.
SHANGHAI = ZoneInfo("Asia/Shanghai")
# Cached CLS news for "today" is considered stale after this interval.
CLS_REFRESH_INTERVAL = timedelta(minutes=3)
CLS_TELEGRAPH_URL = "https://m.cls.cn/telegraph"
# Browser-like headers; cls.cn serves the Next.js payload to normal browsers.
HTTP_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

# Sentiment labels (Chinese): bullish / bearish / neutral.
SENTIMENT_BULL = "\u770b\u591a"
SENTIMENT_BEAR = "\u770b\u7a7a"
SENTIMENT_NEUTRAL = "\u4e2d\u6027"

# Built-in demo accounts, used when storage has none and for seeding.
ACCOUNTS: list[Account] = [
    Account(
        id="touzi-mingjian",
        name="\u6295\u8d44\u660e\u89c1",
        description="\u504f\u4e3b\u9898\u8f6e\u52a8\u4e0e\u4e3b\u7ebf\u5224\u65ad\uff0c\u9002\u5408\u8ddf\u8e2a\u5e02\u573a\u504f\u597d\u53d8\u5316\u3002",
    ),
    Account(
        id="aigujun-2020",
        name="\u7231\u80a1\u541b2020",
        description="\u5173\u6ce8\u60c5\u7eea\u3001\u70ed\u70b9\u6269\u6563\u4e0e\u4ea4\u6613\u7ec6\u8282\u3002",
    ),
    Account(
        id="mazhiming-shouping",
        name="\u9a6c\u5fd7\u660e\u6536\u8bc4",
        description="\u65e5\u5185\u6536\u8bc4\u4e0e\u60c5\u7eea\u53d8\u5316\u603b\u7ed3\u3002",
    ),
    Account(
        id="laobai-guandian",
        name="\u8001\u767d\u5206\u6790\u5ba4\u89c2\u70b9",
        description="\u504f\u7b56\u7565\u62c6\u89e3\u548c\u5173\u952e\u677f\u5757\u8ddf\u8e2a\u3002",
    ),
]

# Fallback sector focus per account, used when keyword inference finds nothing.
ACCOUNT_FOCUS = {
    "touzi-mingjian": ["AI", "\u7b97\u529b", "\u673a\u5668\u4eba"],
    "aigujun-2020": ["CPO", "\u5b58\u50a8\u82af\u7247", "\u65b0\u80fd\u6e90"],
    "mazhiming-shouping": ["AI", "\u5238\u5546", "\u6c7d\u8f66"],
    "laobai-guandian": ["\u673a\u5668\u4eba", "\u534a\u5bfc\u4f53", "\u65b0\u80fd\u6e90"],
}

# Sector -> case-insensitive trigger keywords (mixed Chinese/English).
SECTOR_KEYWORDS = {
    "AI": ["ai", "\u4eba\u5de5\u667a\u80fd", "\u5927\u6a21\u578b", "\u6a21\u578b"],
    "\u7b97\u529b": ["\u7b97\u529b", "compute", "server", "gpu"],
    "CPO": ["cpo", "\u5149\u6a21\u5757", "\u9ad8\u901f\u4e92\u8054"],
    "\u5b58\u50a8\u82af\u7247": ["\u5b58\u50a8", "memory", "dram", "nand"],
    "\u534a\u5bfc\u4f53": ["\u534a\u5bfc\u4f53", "chip", "wafer", "\u6676\u5706"],
    "\u5238\u5546": ["\u5238\u5546", "broker", "\u8bc1\u5238"],
    "\u77f3\u6cb9\u5929\u7136\u6c14": ["\u77f3\u6cb9", "\u5929\u7136\u6c14", "\u6cb9\u6c14", "\u80fd\u6e90\u4ef7\u683c"],
    "\u65b0\u80fd\u6e90": ["\u65b0\u80fd\u6e90", "\u9502\u7535", "\u5149\u4f0f", "\u50a8\u80fd"],
    "\u519b\u5de5": ["\u519b\u5de5", "\u536b\u661f", "\u822a\u5929"],
    "\u673a\u5668\u4eba": ["\u673a\u5668\u4eba", "robot", "\u81ea\u52a8\u5316"],
    "\u6c7d\u8f66": ["\u6c7d\u8f66", "\u8f66\u4f01", "\u667a\u9a7e", "\u6574\u8f66"],
    "\u533b\u836f": ["\u533b\u836f", "\u521b\u65b0\u836f", "\u533b\u7597"],
}

# Keyword lists for the naive sentiment vote in infer_sentiment().
POSITIVE_KEYWORDS = [
    "\u673a\u4f1a",
    "\u4fee\u590d",
    "\u589e\u5f3a",
    "\u4e3b\u7ebf",
    "\u589e\u91cf",
    "\u53cd\u5f39",
    "\u7a81\u7834",
    "\u79ef\u6781",
    "up",
    "bull",
]
NEGATIVE_KEYWORDS = [
    "\u98ce\u9669",
    "\u627f\u538b",
    "\u8c28\u614e",
    "\u56de\u8c03",
    "\u7f29\u91cf",
    "\u89c2\u671b",
    "\u5206\u6b67",
    "bear",
    "down",
]

# Ordered (title keyword, article type) pairs; first match wins.
ARTICLE_TYPE_PATTERNS = [
    ("\u6536\u8bc4", "\u5e02\u573a\u6536\u8bc4"),
    ("\u5348", "\u76d8\u4e2d\u89c2\u5bdf"),
    ("\u7b56\u7565", "\u7b56\u7565\u8ddf\u8e2a"),
    ("\u590d\u76d8", "\u76d8\u9762\u590d\u76d8"),
    ("\u884c\u4e1a", "\u884c\u4e1a\u89c2\u5bdf"),
]

# Canned telegraph items used when the live cls.cn fetch is unavailable.
CLS_NEWS_TEMPLATES = [
    {
        "title": "\u8d22\u8054\u793e\u76d8\u524d\u7cbe\u9009\uff1a\u7b97\u529b\u94fe\u56de\u6696\uff0c\u8d44\u91d1\u91cd\u65b0\u805a\u7126\u9ad8\u666f\u6c14\u65b9\u5411",
        "summary": "\u9694\u591c\u5e02\u573a\u98ce\u9669\u504f\u597d\u56de\u5347\uff0c\u7b97\u529b\u4e0e\u670d\u52a1\u5668\u94fe\u6761\u83b7\u8d44\u91d1\u91cd\u65b0\u914d\u7f6e\uff0c\u60c5\u7eea\u4fee\u590d\u5148\u4e8e\u6210\u4ea4\u5168\u9762\u653e\u5927\u3002",
        "sectors": ["\u7b97\u529b", "AI"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/compute-rebound",
    },
    {
        "title": "AI Daily\uff1aCPO \u4e0e\u5b58\u50a8\u82af\u7247\u540c\u6b65\u8d70\u5f3a\uff0c\u666f\u6c14\u5ea6\u7ebf\u7d22\u5ef6\u7eed",
        "summary": "\u9ad8\u901f\u4e92\u8054\u4e0e\u5b58\u50a8\u62a5\u4ef7\u9884\u671f\u652f\u6491\u677f\u5757\u8868\u73b0\uff0c\u8d44\u91d1\u66f4\u503e\u5411\u4e8e\u56f4\u7ed5\u786e\u5b9a\u6027\u73af\u8282\u96c6\u4e2d\u3002",
        "sectors": ["CPO", "\u5b58\u50a8\u82af\u7247"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/ai-daily-cpo-memory",
    },
    {
        "title": "\u8d22\u8054\u793e\u884c\u4e1a\u89c2\u5bdf\uff1a\u673a\u5668\u4eba\u94fe\u6761\u5206\u5316\uff0c\u8ba2\u5355\u5151\u73b0\u6210\u4e3a\u77ed\u671f\u7126\u70b9",
        "summary": "\u673a\u5668\u4eba\u65b9\u5411\u5185\u90e8\u5f00\u59cb\u51fa\u73b0\u5151\u73b0\u4e0e\u6362\u624b\uff0c\u5e02\u573a\u4ece\u6982\u5ff5\u6269\u6563\u8f6c\u5411\u4e1a\u7ee9\u4e0e\u8ba2\u5355\u9a8c\u8bc1\u3002",
        "sectors": ["\u673a\u5668\u4eba"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/robotics-orders",
    },
    {
        "title": "\u8d22\u8054\u793e\u80fd\u6e90\u8ffd\u8e2a\uff1a\u6cb9\u6c14\u677f\u5757\u9ad8\u4f4d\u9707\u8361\uff0c\u8d44\u91d1\u5207\u5411\u9632\u5fa1\u54c1\u79cd",
        "summary": "\u539f\u6cb9\u4ef7\u683c\u7ef4\u6301\u9ad8\u4f4d\u540e\uff0c\u6cb9\u6c14\u65b9\u5411\u51fa\u73b0\u9ad8\u4f4d\u9707\u8361\uff0c\u90e8\u5206\u8d44\u91d1\u8f6c\u5411\u533b\u836f\u7b49\u9632\u5b88\u677f\u5757\u3002",
        "sectors": ["\u77f3\u6cb9\u5929\u7136\u6c14", "\u533b\u836f"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/energy-rotation",
    },
    {
        "title": "AI Daily\uff1a\u6c7d\u8f66\u4e0e\u667a\u9a7e\u5ef6\u7eed\u5206\u6b67\uff0c\u4e3b\u7ebf\u4ecd\u9700\u7b49\u5f85\u9500\u91cf\u6570\u636e\u9a8c\u8bc1",
        "summary": "\u6574\u8f66\u4e0e\u667a\u9a7e\u65b9\u5411\u5173\u6ce8\u5ea6\u4ecd\u9ad8\uff0c\u4f46\u5e02\u573a\u5bf9\u4f30\u503c\u6269\u5f20\u5df2\u6709\u4fdd\u7559\uff0c\u7b49\u5f85\u9500\u91cf\u548c\u8ba2\u5355\u6570\u636e\u786e\u8ba4\u3002",
        "sectors": ["\u6c7d\u8f66"],
        "sentiment": SENTIMENT_BEAR,
        "reference_url": "https://www.cls.cn/detail/auto-data-watch",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u534a\u5bfc\u4f53\u8bbe\u5907\u65b9\u5411\u8d70\u5f3a\uff0c\u673a\u6784\u79f0\u56fd\u4ea7\u66ff\u4ee3\u8282\u594f\u63d0\u901f",
        "summary": "\u6676\u5706\u5236\u9020\u4e0e\u8bbe\u5907\u94fe\u6761\u51fa\u73b0\u5f02\u52a8\uff0c\u5e02\u573a\u56f4\u7ed5\u56fd\u4ea7\u66ff\u4ee3\u548c\u8d44\u672c\u5f00\u652f\u6062\u590d\u91cd\u65b0\u5b9a\u4ef7\u3002",
        "sectors": ["\u534a\u5bfc\u4f53"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/semi-equipment-up",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u5238\u5546\u677f\u5757\u5348\u540e\u62c9\u5347\uff0c\u5e02\u573a\u60c5\u7eea\u6709\u6240\u4fee\u590d",
        "summary": "\u6307\u6570\u9707\u8361\u8fc7\u7a0b\u4e2d\u5238\u5546\u627f\u62c5\u60c5\u7eea\u4fee\u590d\u529f\u80fd\uff0c\u5e26\u52a8\u90e8\u5206\u9ad8\u5f39\u6027\u65b9\u5411\u56de\u6696\u3002",
        "sectors": ["\u5238\u5546"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/broker-rebound",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u521b\u65b0\u836f\u65b9\u5411\u6301\u7eed\u6d3b\u8dc3\uff0c\u8d44\u91d1\u8f6c\u5411\u9632\u5b88\u4e0e\u6210\u957f\u517c\u987e",
        "summary": "\u533b\u836f\u677f\u5757\u83b7\u5f97\u589e\u91cf\u8d44\u91d1\u5173\u6ce8\uff0c\u521b\u65b0\u836f\u548c\u5668\u68b0\u7ec6\u5206\u8868\u73b0\u66f4\u5f3a\u3002",
        "sectors": ["\u533b\u836f"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/medical-active",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u65b0\u80fd\u6e90\u94fe\u6761\u5206\u5316\u52a0\u5267\uff0c\u673a\u6784\u63d0\u9192\u5173\u6ce8\u4ea7\u80fd\u51fa\u6e05\u8282\u594f",
        "summary": "\u65b0\u80fd\u6e90\u677f\u5757\u5185\u90e8\u8f6e\u52a8\u660e\u663e\uff0c\u8d44\u91d1\u66f4\u504f\u5411\u4f4e\u4f4d\u73af\u8282\u548c\u6210\u672c\u6539\u5584\u65b9\u5411\u3002",
        "sectors": ["\u65b0\u80fd\u6e90"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/new-energy-split",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u519b\u5de5\u677f\u5757\u76d8\u4e2d\u5f02\u52a8\uff0c\u8ba2\u5355\u5151\u73b0\u9884\u671f\u91cd\u65b0\u5347\u6e29",
        "summary": "\u519b\u5de5\u94fe\u6761\u76d8\u4e2d\u8d70\u5f3a\uff0c\u5e02\u573a\u5173\u6ce8\u540e\u7eed\u8ba2\u5355\u5151\u73b0\u4e0e\u4f30\u503c\u5207\u6362\u7a7a\u95f4\u3002",
        "sectors": ["\u519b\u5de5"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/defense-orders",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u673a\u5668\u4eba\u677f\u5757\u51b2\u9ad8\u56de\u843d\uff0c\u77ed\u7ebf\u535a\u5f08\u60c5\u7eea\u5347\u6e29",
        "summary": "\u673a\u5668\u4eba\u65b9\u5411\u9ad8\u4f4d\u9707\u8361\uff0c\u8d44\u91d1\u5728\u9898\u6750\u6269\u6563\u4e0e\u5151\u73b0\u538b\u529b\u4e4b\u95f4\u53cd\u590d\u5207\u6362\u3002",
        "sectors": ["\u673a\u5668\u4eba"],
        "sentiment": SENTIMENT_NEUTRAL,
        "reference_url": "https://www.cls.cn/detail/robotics-intraday",
    },
    {
        "title": "\u8d22\u8054\u793e7x24\uff1a\u5b58\u50a8\u82af\u7247\u62a5\u4ef7\u9884\u671f\u7ee7\u7eed\u4e0a\u4fee\uff0c\u4ea7\u4e1a\u94fe\u666f\u6c14\u5ea6\u53d7\u5173\u6ce8",
        "summary": "\u5b58\u50a8\u73af\u8282\u4ef7\u683c\u4fee\u590d\u903b\u8f91\u5ef6\u7eed\uff0c\u5e02\u573a\u91cd\u65b0\u4ea4\u6613\u4f9b\u9700\u6539\u5584\u4e0e\u76c8\u5229\u5f39\u6027\u3002",
        "sectors": ["\u5b58\u50a8\u82af\u7247"],
        "sentiment": SENTIMENT_BULL,
        "reference_url": "https://www.cls.cn/detail/memory-price-up",
    },
]

# Demo seed data: day-offset -> account_id -> article links.
SAMPLE_INPUTS = {
    1: {
        "touzi-mingjian": ["https://mp.weixin.qq.com/s/semiconductor-capacity-and-chip-cycle"],
        "aigujun-2020": ["https://mp.weixin.qq.com/s/storage-chip-price-repair"],
        "mazhiming-shouping": ["https://mp.weixin.qq.com/s/market-close-sector-rotation"],
        "laobai-guandian": ["https://mp.weixin.qq.com/s/robotics-and-energy-balance"],
    },
}


def now_local() -> datetime:
    """Return the current time in Asia/Shanghai."""
    return datetime.now(SHANGHAI)


def iso_timestamp(value: datetime | None = None) -> str:
    """Return *value* (default: now) as an ISO string with microseconds dropped."""
    return (value or now_local()).replace(microsecond=0).isoformat()


def ensure_local_timezone(value: datetime) -> datetime:
    """Coerce *value* to Asia/Shanghai; naive datetimes are assumed local."""
    if value.tzinfo is None:
        return value.replace(tzinfo=SHANGHAI)
    return value.astimezone(SHANGHAI)


def normalize_whitespace(value: str) -> str:
    """Collapse runs of whitespace to single spaces and strip the ends."""
    return re.sub(r"\s+", " ", value).strip()


def extract_json_object(script_text: str, marker: str) -> str:
    """Return the first balanced ``{...}`` JSON object following *marker*.

    Tracks string literals and escapes so braces inside strings do not
    affect the depth count. Raises RuntimeError when the marker or a
    balanced object cannot be found.
    """
    marker_index = script_text.find(marker)
    if marker_index < 0:
        raise RuntimeError(f"Marker not found: {marker}")
    start = script_text.find("{", marker_index)
    if start < 0:
        raise RuntimeError(f"JSON object start not found for marker: {marker}")
    depth = 0
    in_string = False
    escaped = False
    for index in range(start, len(script_text)):
        char = script_text[index]
        if in_string:
            # Inside a string: only an unescaped quote ends it.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
            continue
        if char == "{":
            depth += 1
            continue
        if char == "}":
            depth -= 1
            if depth == 0:
                return script_text[start : index + 1]
    raise RuntimeError(f"JSON object end not found for marker: {marker}")


def parse_telegraph_timestamp(date_str: str, time_str: str) -> str:
    """Combine a date and an ``HH:MM``/``HH:MM:SS`` time into a Shanghai ISO string."""
    normalized_time = time_str if len(time_str.split(":")) == 3 else f"{time_str}:00"
    return datetime.fromisoformat(f"{date_str}T{normalized_time}").replace(tzinfo=SHANGHAI).isoformat(timespec="seconds")


def split_title_and_summary(content: str) -> tuple[str, str]:
    """Heuristically split telegraph text into a (title, summary) pair.

    A leading bracketed segment (e.g. 【...】) becomes the title; otherwise
    the first sentence is the title and the (possibly truncated) full text
    is the summary.
    """
    cleaned = normalize_whitespace(content)
    bracket_match = re.match(r"^[\[({\u3010\u3016](.+?)[\])}\u3011\u3017][\uff1a: ]*(.*)$", cleaned)
    if bracket_match:
        title = normalize_whitespace(bracket_match.group(1))
        summary = normalize_whitespace(bracket_match.group(2) or cleaned)
        return title[:80], summary or title
    sentence_parts = re.split(r"[。;;!?!?]", cleaned, maxsplit=1)
    title = sentence_parts[0][:80]
    # Keep summaries at most 220 chars (217 + ellipsis).
    summary = cleaned if len(cleaned) <= 220 else f"{cleaned[:217]}..."
    return title, summary


def build_fallback_cls_items(reference_time: datetime) -> list[ClsNewsItem]:
    """Materialize the canned templates as items back-dated from *reference_time*."""
    items: list[ClsNewsItem] = []
    for index, template in enumerate(CLS_NEWS_TEMPLATES):
        # Spread items backwards so the feed looks like a day's worth of posts.
        published_at = (reference_time - timedelta(minutes=index * 95 + 8)).replace(microsecond=0).isoformat()
        items.append(
            ClsNewsItem(
                id=f"cls-{index + 1}",
                title=template["title"],
                published_at=published_at,
                source="\u8d22\u8054\u793e" if index % 2 == 0 else "\u8d22\u8054\u793e AI Daily",
                summary=template["summary"],
                reference_url=template["reference_url"],
                sectors=template["sectors"],
                sentiment=template["sentiment"],
            )
        )
    return sorted(items, key=lambda item: item.published_at, reverse=True)


def fetch_cls_telegraph_items(reference_time: datetime) -> list[ClsNewsItem]:
    """Scrape the cls.cn mobile telegraph page for items dated *reference_time*.

    Parses the ``__NEXT_DATA__`` JSON blob embedded in the page, keeps only
    entries from the reference date, and caps the result at 80 items.
    Raises RuntimeError on any structural problem so callers can fall back.
    """
    session = requests.Session()
    # Ignore proxy/env settings so the fetch behaves the same everywhere.
    session.trust_env = False
    response = session.get(CLS_TELEGRAPH_URL, headers=HTTP_HEADERS, timeout=15)
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    next_data_script = None
    for script in soup.find_all("script"):
        script_text = script.string or script.get_text()
        if "__NEXT_DATA__ =" in script_text:
            next_data_script = script_text
            break
    if not next_data_script:
        raise RuntimeError("Missing __NEXT_DATA__ payload on cls.cn")
    next_data = json.loads(extract_json_object(next_data_script, "__NEXT_DATA__ ="))
    roll_data = (
        next_data.get("props", {})
        .get("initialState", {})
        .get("roll_data", [])
    )
    if not isinstance(roll_data, list) or not roll_data:
        raise RuntimeError("Missing roll_data in cls.cn payload")
    target_date = reference_time.date()
    items: list[ClsNewsItem] = []
    seen_ids: set[int] = set()
    latest_limit = 80
    for entry in roll_data:
        if len(items) >= latest_limit:
            break
        item_id = int(entry.get("id") or 0)
        if not item_id or item_id in seen_ids:
            continue
        seen_ids.add(item_id)
        # Prefer the modification time; fall back to creation time.
        timestamp = int(entry.get("modified_time") or entry.get("ctime") or 0)
        if not timestamp:
            continue
        published_dt = datetime.fromtimestamp(timestamp, tz=SHANGHAI)
        if published_dt.date() != target_date:
            continue
        raw_content = normalize_whitespace(
            entry.get("content") or entry.get("brief") or entry.get("title") or ""
        )
        if len(raw_content) < 8:
            # Too short to be a meaningful telegraph item.
            continue
        title = normalize_whitespace(entry.get("title") or "")
        if not title:
            title, _ = split_title_and_summary(raw_content)
        summary = normalize_whitespace(entry.get("brief") or "")
        if not summary:
            _, summary = split_title_and_summary(raw_content)
        source = normalize_whitespace(entry.get("author") or "\u8d22\u8054\u793e7x24")
        reference_url = normalize_whitespace(entry.get("shareurl") or "")
        if not reference_url:
            reference_url = f"https://www.cls.cn/detail/{item_id}"
        sectors = infer_sectors(f"{title} {summary}", "touzi-mingjian")
        sentiment = infer_sentiment(f"{title} {summary}")
        items.append(
            ClsNewsItem(
                id=f"cls-live-{item_id}",
                title=title[:120],
                published_at=published_dt.isoformat(timespec="seconds"),
                source=source,
                summary=summary[:500],
                reference_url=reference_url,
                sectors=sectors,
                sentiment=sentiment,
            )
        )
    if not items:
        raise RuntimeError("No telegraph items parsed from cls.cn")
    return sorted(items, key=lambda item: item.published_at, reverse=True)


def get_accounts() -> list[Account]:
    """Return stored accounts, falling back to the built-in demo list."""
    records = fetch_accounts()
    return records or ACCOUNTS


def normalize_date(value: str) -> str:
    """Normalize an ISO date/datetime string to a plain ``YYYY-MM-DD`` date."""
    return datetime.fromisoformat(value).date().isoformat()


def blank_daily_input(date_str: str) -> DailyInputDocument:
    """Build an empty input document (no links) for every known account."""
    return DailyInputDocument(
        date=date_str,
        updated_at=iso_timestamp(),
        accounts=[
            DailyInputAccount(account_id=account.id, account_name=account.name, links=[])
            for account in get_accounts()
        ],
    )


def clean_links(links: Iterable[str]) -> list[str]:
    """Strip, de-blank, and de-duplicate links while preserving order."""
    normalized: list[str] = []
    seen: set[str] = set()
    for raw_link in links:
        link = raw_link.strip()
        if not link or link in seen:
            continue
        seen.add(link)
        normalized.append(link)
    return normalized


def normalize_daily_input(date_str: str, payload: DailyInputUpsertPayload) -> DailyInputDocument:
    """Merge an upsert payload into a full document covering every account."""
    payload_map = {item.account_id: clean_links(item.links) for item in payload.accounts}
    return DailyInputDocument(
        date=date_str,
        updated_at=iso_timestamp(),
        accounts=[
            DailyInputAccount(
                account_id=account.id,
                account_name=account.name,
                links=payload_map.get(account.id, []),
            )
            for account in get_accounts()
        ],
    )


def load_daily_input(date_str: str) -> DailyInputDocument:
    """Load the day's input document, or a blank one if none is stored."""
    payload = fetch_daily_input_document(date_str)
    if payload is None:
        return blank_daily_input(date_str)
    return payload


def save_daily_input(document: DailyInputDocument) -> DailyInputDocument:
    """Persist a daily input document (thin storage wrapper)."""
    return save_daily_input_document(document)


def load_report(date_str: str) -> ReportDocument | None:
    """Load the stored report for a date, if any (thin storage wrapper)."""
    return fetch_report_document(date_str)


def save_report(document: ReportDocument) -> ReportDocument:
    """Persist a report document (thin storage wrapper)."""
    return save_report_document(document)


def list_reports() -> list[ReportListItem]:
    """List stored reports (thin storage wrapper)."""
    return fetch_report_list()


def title_from_link(account_name: str, url: str, index: int) -> str:
    """Derive a human-readable article title from a URL's path tokens.

    Falls back to a generic numbered title when the path yields no
    meaningful tokens.
    """
    text = unquote(urlparse(url).path or url)
    tokens = [
        token
        for token in re.split(r"[\W_]+", text.lower())
        if token and token not in {"s", "mp", "weixin", "qq", "com"}
    ]
    meaningful = [token for token in tokens if len(token) > 1]
    if meaningful:
        # Short tokens are treated as acronyms (upper-cased), others capitalized.
        topic = " / ".join(token.upper() if len(token) <= 3 else token.capitalize() for token in meaningful[:3])
        return f"{account_name}\uff1a{topic} \u89c2\u5bdf"
    return f"{account_name}\uff1a\u5e02\u573a\u8ddf\u8e2a\u7b2c {index + 1} \u6761"


def infer_sectors(text: str, account_id: str) -> list[str]:
    """Match *text* against sector keywords; fall back to the account's focus list."""
    lowered = text.lower()
    sectors = [
        sector
        for sector, keywords in SECTOR_KEYWORDS.items()
        if any(keyword.lower() in lowered for keyword in keywords)
    ]
    if sectors:
        return sectors[:3]
    return ACCOUNT_FOCUS.get(account_id, ["AI", "\u7b97\u529b"])[:2]


def infer_sentiment(text: str) -> str:
    """Vote positive vs negative keyword hits; ties are neutral."""
    lowered = text.lower()
    positive = sum(keyword.lower() in lowered for keyword in POSITIVE_KEYWORDS)
    negative = sum(keyword.lower() in lowered for keyword in NEGATIVE_KEYWORDS)
    if positive > negative:
        return SENTIMENT_BULL
    if negative > positive:
        return SENTIMENT_BEAR
    return SENTIMENT_NEUTRAL


def infer_article_type(title: str) -> str:
    """Classify an article by the first matching title keyword pattern."""
    lowered = title.lower()
    for keyword, article_type in ARTICLE_TYPE_PATTERNS:
        if keyword.lower() in lowered:
            return article_type
    return "\u4e3b\u9898\u89c2\u70b9"


def build_article_summary(title: str, sectors: list[str], sentiment: str) -> str:
    """Compose a one-sentence Chinese summary from title, sectors, and sentiment."""
    sector_text = "\u3001".join(sectors[:2]) if sectors else "\u6838\u5fc3\u4e3b\u7ebf"
    sentiment_text = {
        SENTIMENT_BULL: "\u504f\u79ef\u6781\u7684\u8282\u594f\u5224\u65ad",
        SENTIMENT_BEAR: "\u660e\u663e\u504f\u8c28\u614e\u7684\u98ce\u9669\u63d0\u9192",
        SENTIMENT_NEUTRAL: "\u66f4\u5f3a\u8c03\u7ed3\u6784\u5206\u5316\u4e0e\u7b49\u5f85\u786e\u8ba4",
    }[sentiment]
    return f"{title} \u56f4\u7ed5 {sector_text} \u5c55\u5f00\uff0c\u7ed9\u51fa\u7684\u7ed3\u8bba\u662f{sentiment_text}\uff0c\u9002\u5408\u4f5c\u4e3a\u5f53\u65e5\u76d8\u9762\u8ddf\u8e2a\u4e0e\u590d\u76d8\u53c2\u8003\u3002"


def generate_report(date_str: str, input_document: DailyInputDocument) -> ReportDocument:
    """Generate the daily report from the day's article links.

    Each link becomes an OpinionArticle with a heuristically inferred
    title, sectors, sentiment, and a synthetic intra-day publish time.
    With no links at all, a placeholder report is returned.
    """
    base_date = datetime.fromisoformat(date_str)
    articles: list[OpinionArticle] = []
    for account_index, account in enumerate(input_document.accounts):
        for link_index, url in enumerate(account.links):
            title = title_from_link(account.account_name, url, link_index)
            sectors = infer_sectors(f"{title} {url}", account.account_id)
            sentiment = infer_sentiment(f"{title} {url}")
            # Synthetic publish times spread across the trading day (09:00+).
            published_at = (
                base_date.replace(hour=9 + ((account_index + link_index) % 8), minute=(link_index * 12) % 60)
                .replace(tzinfo=SHANGHAI)
                .isoformat(timespec="seconds")
            )
            articles.append(
                OpinionArticle(
                    id=f"{date_str}-{account.account_id}-{link_index}",
                    account_id=account.account_id,
                    account_name=account.account_name,
                    title=title,
                    published_at=published_at,
                    summary=build_article_summary(title, sectors, sentiment),
                    source_url=url,
                    sectors=sectors,
                    sentiment=sentiment,
                    article_type=infer_article_type(title),
                )
            )
    if not articles:
        # No links entered yet: keep the report shell so the UI has a row.
        return ReportDocument(
            date=date_str,
            generated_at=iso_timestamp(),
            summary="\u5f53\u65e5\u5c1a\u672a\u5f55\u5165\u6587\u7ae0\u94fe\u63a5\uff0c\u7cfb\u7edf\u5df2\u4fdd\u7559\u65e5\u62a5\u7ed3\u6784\uff0c\u7b49\u5f85\u8865\u5145\u516c\u4f17\u53f7\u6587\u7ae0\u540e\u518d\u751f\u6210\u5b8c\u6574\u7ed3\u8bba\u3002",
            focus_sectors=[],
            article_count=0,
            account_count=0,
            articles=[],
        )
    sector_counter = Counter(sector for article in articles for sector in article.sectors)
    focus_sectors = [sector for sector, _count in sector_counter.most_common(4)]
    sentiment_counter = Counter(article.sentiment for article in articles)
    if sentiment_counter[SENTIMENT_BULL] > sentiment_counter[SENTIMENT_BEAR]:
        tone = "\u6574\u4f53\u504f\u79ef\u6781\uff0c\u4e3b\u7ebf\u8ba8\u8bba\u96c6\u4e2d\u5ea6\u8f83\u9ad8"
    elif sentiment_counter[SENTIMENT_BEAR] > sentiment_counter[SENTIMENT_BULL]:
        tone = "\u6574\u4f53\u504f\u8c28\u614e\uff0c\u98ce\u9669\u63a7\u5236\u4ecd\u662f\u4e3b\u53d9\u4e8b"
    else:
        tone = "\u591a\u7a7a\u5206\u6b67\u5e76\u5b58\uff0c\u5e02\u573a\u66f4\u770b\u91cd\u9a8c\u8bc1\u4e0e\u8282\u594f"
    active_accounts = len([account for account in input_document.accounts if account.links])
    sector_text = "\u3001".join(focus_sectors) if focus_sectors else "\u6682\u65e0\u805a\u7126\u677f\u5757"
    summary = (
        f"{date_str} \u5171\u6574\u7406 {len(articles)} \u7bc7\u516c\u4f17\u53f7\u89c2\u70b9\uff0c\u8986\u76d6 {active_accounts} \u4e2a\u8d26\u6237\u3002"
        f"{tone}\uff0c\u8ba8\u8bba\u91cd\u70b9\u843d\u5728 {sector_text}\u3002"
    )
    return ReportDocument(
        date=date_str,
        generated_at=iso_timestamp(),
        summary=summary,
        focus_sectors=focus_sectors,
        article_count=len(articles),
        account_count=active_accounts,
        articles=sorted(articles, key=lambda item: item.published_at, reverse=True),
    )


def build_cls_news_document(
    reference_time: datetime | None = None,
    *,
    allow_live_fetch: bool = True,
) -> ClsNewsDocument:
    """Assemble the CLS news document (items + summary + sector impacts).

    Tries the live cls.cn fetch when allowed; any failure — or a disallowed
    fetch for a non-current date — falls back to the canned templates.
    """
    current = reference_time or now_local()
    try:
        if allow_live_fetch:
            items = fetch_cls_telegraph_items(current)
        else:
            raise RuntimeError("Live fetch disabled for non-current date")
    except Exception:
        # Deliberate best-effort: the canned feed keeps the UI populated.
        items = build_fallback_cls_items(current)
    sector_counter = Counter(sector for item in items for sector in item.sectors)
    watch_list = [sector for sector, _count in sector_counter.most_common(5)]
    overview = (
        "\u8d44\u8baf\u5217\u8868\u5c55\u793a\u6240\u9009\u65e5\u671f\u5185\u7684\u8d22\u8054\u793e 7x24 \u8d44\u8baf\uff0c"
        "\u5f53\u65e5\u6570\u636e\u6765\u81ea cls.cn \u5b9e\u65f6\u6293\u53d6\uff0c\u6bcf 3 \u5206\u949f\u66f4\u65b0\u4e00\u6b21\u3002"
    )
    # Hoisted out of the f-string: a backslash escape inside a replacement
    # field is a SyntaxError before Python 3.12.
    top_watch_text = "\u3001".join(watch_list[:3])
    hot_topics = (
        "\u70ed\u70b9\u6982\u89c8\u53ea\u4fdd\u7559\u5bf9\u677f\u5757\u5b58\u5728\u660e\u663e\u5f71\u54cd\u7684\u65b9\u5411\uff0c"
        f"\u5f53\u524d\u4e3b\u8981\u96c6\u4e2d\u5728 {top_watch_text}\u3002"
    )
    sector_impacts: list[ClsSectorImpact] = []
    seen_sectors: set[str] = set()
    for sector in watch_list[:4]:
        if sector in seen_sectors:
            continue
        seen_sectors.add(sector)
        related_items = [item for item in items if sector in item.sectors]
        if not related_items:
            continue
        sentiment_counter = Counter(item.sentiment for item in related_items)
        if sentiment_counter[SENTIMENT_BULL] > sentiment_counter[SENTIMENT_BEAR]:
            sentiment = SENTIMENT_BULL
            reason = f"{sector} \u65b9\u5411\u51fa\u73b0\u50ac\u5316\u6216\u666f\u6c14\u5f3a\u5316\uff0c\u77ed\u7ebf\u504f\u6b63\u5411\u5f71\u54cd\u3002"
        elif sentiment_counter[SENTIMENT_BEAR] > sentiment_counter[SENTIMENT_BULL]:
            sentiment = SENTIMENT_BEAR
            reason = f"{sector} \u65b9\u5411\u51fa\u73b0\u5151\u73b0\u6216\u5206\u6b67\uff0c\u77ed\u7ebf\u504f\u8d1f\u5411\u5f71\u54cd\u3002"
        else:
            sentiment = SENTIMENT_NEUTRAL
            reason = f"{sector} \u65b9\u5411\u6709\u8ba8\u8bba\u4f46\u4ecd\u9700\u9a8c\u8bc1\uff0c\u77ed\u7ebf\u4ee5\u4e2d\u6027\u89c2\u5bdf\u4e3a\u4e3b\u3002"
        sector_impacts.append(
            ClsSectorImpact(
                sector=sector,
                sentiment=sentiment,
                reason=reason,
                related_titles=list(dict.fromkeys(item.title for item in related_items[:2])),
            )
        )
    return ClsNewsDocument(
        date=current.date().isoformat(),
        updated_at=iso_timestamp(current),
        window_label="\u5f53\u5929\u8d44\u8baf",
        summary=ClsNewsSummary(
            overview=overview,
            hot_topics=hot_topics,
            watch_list=watch_list,
        ),
        sector_impacts=sector_impacts,
        items=items,
    )


def load_cls_news(date_str: str) -> ClsNewsDocument | None:
    """Load the stored CLS news document for a date, if any."""
    return fetch_cls_news_document(date_str)


def build_reference_time(date_str: str) -> datetime:
    """Return now for today's date, otherwise 15:00 (market close) on that date."""
    date_value = datetime.fromisoformat(date_str).date()
    if date_value == now_local().date():
        return now_local()
    return datetime.combine(date_value, time(hour=15, minute=0), tzinfo=SHANGHAI)


def refresh_cls_news(date_str: str | None = None) -> ClsNewsDocument:
    """Rebuild and persist CLS news for a date (default: today).

    Live fetching is only attempted for the current date. If rebuilding
    fails and a cached document exists, the cache is returned instead.
    """
    normalized_date = normalize_date(date_str or now_local().date().isoformat())
    existing = load_cls_news(normalized_date)
    reference_time = build_reference_time(normalized_date)
    allow_live_fetch = normalized_date == now_local().date().isoformat()
    try:
        document = build_cls_news_document(reference_time, allow_live_fetch=allow_live_fetch)
    except Exception:
        if existing is not None:
            return existing
        raise
    return save_cls_news_document(document)


def get_cls_news(date_str: str | None = None) -> ClsNewsDocument:
    """Return CLS news for a date, refreshing today's document when stale."""
    normalized_date = normalize_date(date_str or now_local().date().isoformat())
    document = load_cls_news(normalized_date)
    if document is None:
        return refresh_cls_news(normalized_date)
    if normalized_date != now_local().date().isoformat():
        # Historical documents never go stale.
        return document
    updated_at = ensure_local_timezone(datetime.fromisoformat(document.updated_at))
    if now_local() - updated_at >= CLS_REFRESH_INTERVAL:
        return refresh_cls_news(normalized_date)
    return document


def seed_demo_content() -> None:
    """Seed storage with demo accounts, sample inputs/reports, and today's news."""
    save_accounts(ACCOUNTS)
    today = now_local().date()
    for offset, account_links in SAMPLE_INPUTS.items():
        date_str = (today - timedelta(days=offset)).isoformat()
        # Skip days that already have both an input document and a report.
        if fetch_daily_input_document(date_str) is not None and fetch_report_document(date_str) is not None:
            continue
        payload = DailyInputUpsertPayload(
            accounts=[
                {"account_id": account.id, "links": account_links.get(account.id, [])}
                for account in ACCOUNTS
            ]
        )
        input_document = normalize_daily_input(date_str, payload)
        save_daily_input_document(input_document)
        save_report_document(generate_report(date_str, input_document))
    today_str = today.isoformat()
    if fetch_cls_news_document(today_str) is None:
        save_cls_news_document(build_cls_news_document())