from __future__ import annotations

import json
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

# Repository root: this script is assumed to live one directory below it.
ROOT = Path(__file__).resolve().parents[1]
OUTPUT_PATH = ROOT / "backend" / "data" / "source_probe_results.json"

# Browser-like headers; the probed Chinese finance sites tend to reject
# bare urllib requests without a User-Agent.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9",
}


@dataclass
class ProbeResult:
    """Outcome of probing one candidate data source for southbound-flow data."""

    source: str  # short identifier of the probed site
    page_url: str  # URL that was fetched
    fetched: bool  # whether the HTTP request completed
    status_code: int | None = None
    title: str | None = None  # <title> text of the fetched page
    latest_date: str | None = None  # newest YYYY-MM-DD date found on the page
    direct_table_access: bool = False  # page exposes a scrapable data table
    pagination_access: bool = False  # history pages reachable via plain URLs
    realtime_hint: bool = False  # page hints at intraday / real-time data
    notes: list[str] = field(default_factory=list)
    error: str | None = None  # "ExcType: message" when the fetch failed
    extracted: dict[str, Any] = field(default_factory=dict)


def _build_request(url: str, referer: str | None) -> Request:
    """Build a Request with the shared browser-like headers and optional Referer."""
    headers = dict(DEFAULT_HEADERS)
    if referer:
        headers["Referer"] = referer
    return Request(url, headers=headers)


def fetch(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """Fetch *url* and return ``(status, html)``.

    The body is decoded with the charset declared in the response headers,
    falling back to UTF-8 when none is declared or the declared one is
    unknown to Python's codec registry.
    """
    with urlopen(_build_request(url, referer), timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        raw = response.read()
        try:
            html = raw.decode(charset, "ignore")
        except LookupError:  # server advertised a charset Python doesn't know
            html = raw.decode("utf-8", "ignore")
        return response.status, html


def fetch_gbk(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """Fetch *url* and return ``(status, html)`` decoded as GBK.

    Used for 10jqka pages, which serve GBK regardless of header hints.
    """
    with urlopen(_build_request(url, referer), timeout=timeout) as response:
        return response.status, response.read().decode("gbk", "ignore")


def extract_title(html: str) -> str | None:
    """Return the page ``<title>`` text with whitespace collapsed, or ``None``.

    NOTE(review): the original pattern was garbled to ``r"(.*?)"`` (tags lost
    during extraction); reconstructed as a standard title matcher — confirm.
    """
    match = re.search(r"<title>(.*?)</title>", html, re.S | re.I)
    if not match:
        return None
    return re.sub(r"\s+", " ", match.group(1)).strip()


def extract_first_date(html: str) -> str | None:
    """Return the first YYYY-MM-DD date (years 2000-2099) in *html*, or ``None``.

    The original ran the identical regex twice in a row; a single search is
    equivalent.
    """
    match = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
    return match.group(1) if match else None


def extract_page_info(html: str) -> str | None:
    """Return the pagination summary text (e.g. ``"1/50"``), or ``None``.

    NOTE(review): the original pattern lost its surrounding tag during
    extraction (it read ``r'([^<]+)'``); reconstructed against the 10jqka
    ``page_info`` span — confirm against a live page.
    """
    match = re.search(r'<span[^>]*class="page_info"[^>]*>([^<]+)</span>', html)
    return match.group(1).strip() if match else None


def probe_eastmoney() -> ProbeResult:
    """Probe Eastmoney's public southbound-flow page for scrape-ability."""
    result = ProbeResult(
        source="eastmoney",
        page_url="https://data.eastmoney.com/hsgtV2/hsgtDetail/scgkDetail_nx.html",
        fetched=False,
    )
    try:
        status, html = fetch(result.page_url, referer="https://data.eastmoney.com/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        # NOTE(review): this pattern contained a dangling "]+>" fragment in the
        # source dump; reconstructed as "label, tag, tag, date" — confirm.
        date_match = re.search(
            r"更新时间[:: ]*<[^>]+>\s*<[^>]*>(20\d{2}-\d{2}-\d{2})", html
        )
        fallback_date = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
        if date_match:
            result.latest_date = date_match.group(1)
        elif fallback_date:
            result.latest_date = fallback_date.group(1)
        result.direct_table_access = (
            "南向资金" in html and "港股通(沪)" in html and "港股通(深)" in html
        )
        result.pagination_access = "南向历史" in html or "历史数据" in html
        result.realtime_hint = "成交净买额(当日)" in html or "实时" in html
        result.notes.extend(
            [
                "公开页面可访问。",
                "页面文案包含南向资金、港股通(沪)、港股通(深)。",
            ]
        )
        if result.latest_date:
            result.notes.append(f"页面中检出日期 {result.latest_date}。")
        result.extracted = {
            "contains_southbound": "南向资金" in html,
            "contains_shanghai": "港股通(沪)" in html,
            "contains_shenzhen": "港股通(深)" in html,
            "contains_today_net_buy": "成交净买额(当日)" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_zhitong() -> ProbeResult:
    """Probe a Zhitong Caijing article page (news content, not a data table)."""
    url = "https://www.zhitongcaijing.com/content/detail/1295067.html"
    result = ProbeResult(source="zhitong", page_url=url, fetched=False)
    try:
        status, html = fetch(url, referer="https://www.zhitongcaijing.com/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        # Prefer a full timestamp; fall back to any bare date on the page.
        date_match = re.search(r"(20\d{2}-\d{2}-\d{2})\s+\d{2}:\d{2}:\d{2}", html)
        result.latest_date = (
            date_match.group(1) if date_match else extract_first_date(html)
        )
        result.direct_table_access = False
        result.pagination_access = False
        result.realtime_hint = False
        # The original tested the identical "T+2" literal twice (most likely a
        # full-width variant lost in the dump); one membership test suffices.
        t2 = "T+2" in html
        delayed = "延迟数据" in html or "T+2日结算" in html
        result.notes.extend(
            [
                "站点可访问,但当前命中的是资讯文章页。",
                "页面语义更偏新闻/统计解读,不是可直接分页拉取的标准数据表。",
            ]
        )
        if t2 or delayed:
            result.notes.append("页面明确指向 T+2 或延迟数据。")
        result.extracted = {
            "contains_t_plus_2": t2,
            "contains_delay_notice": delayed,
            "contains_southbound": "南向资金" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_wind() -> ProbeResult:
    """Probe Wind's product page; data there requires a licensed terminal/API."""
    url = "https://www.wind.com.cn/portal/zh/WFT/index.html"
    result = ProbeResult(source="wind", page_url=url, fetched=False)
    try:
        status, html = fetch(url, referer="https://www.wind.com.cn/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        result.latest_date = extract_first_date(html)
        result.direct_table_access = False
        result.pagination_access = False
        result.realtime_hint = "API" in html or "Client API" in html
        result.notes.extend(
            [
                "官方产品页可访问。",
                "当前拿到的是产品介绍页,不是公开南向资金网页数据表。",
            ]
        )
        if result.realtime_hint:
            result.notes.append("页面包含 API/客户端能力描述,说明数据更可能通过授权终端或接口获取。")
        result.extracted = {
            "contains_client_api": "Client API" in html,
            "contains_excel_plugin": "Excel" in html,
            "contains_terminal": "金融终端" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_ths_reference() -> ProbeResult:
    """Probe 10jqka's public HGT history table and its page-2 pagination URL."""
    result = ProbeResult(
        source="ths_reference",
        page_url="https://data.10jqka.com.cn/hgt/ggtb/",
        fetched=False,
    )
    try:
        status, html = fetch_gbk(result.page_url, referer="https://data.10jqka.com.cn/")
        status_page_2, html_page_2 = fetch_gbk(
            "https://data.10jqka.com.cn/hgt/ggtb/board/getGgtPage/page/2/",
            referer=result.page_url,
        )
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        result.latest_date = extract_first_date(html)
        # NOTE(review): both marker literals were lost in the source dump (the
        # original read `= "" in html`, which is always True); "<table" is the
        # most plausible reconstruction for an HTML data table — confirm.
        result.direct_table_access = "<table" in html
        result.pagination_access = "<table" in html_page_2 and status_page_2 == 200
        result.realtime_hint = False
        result.notes.extend(
            [
                "同花顺公开 HTML 历史表可抓取。",
                "分页可通过非 ajax=1 直达 URL 访问。",
            ]
        )
        result.extracted = {
            "page_info": extract_page_info(html),
            "page_2_first_date": extract_first_date(html_page_2),
            "latest_date": result.latest_date,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def main() -> None:
    """Run every probe and write the combined JSON report to OUTPUT_PATH."""
    results = [
        probe_eastmoney(),
        probe_zhitong(),
        probe_wind(),
        probe_ths_reference(),
    ]
    payload = {
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "results": [asdict(item) for item in results],
    }
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(str(OUTPUT_PATH))


if __name__ == "__main__":
    main()