# zjjk/tools/source_probe.py
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
# Repository root: one directory above this tools/ module.
ROOT = Path(__file__).resolve().parents[1]
# Probe results are persisted here as UTF-8 JSON for downstream consumption.
OUTPUT_PATH = ROOT / "backend" / "data" / "source_probe_results.json"
# Browser-like request headers sent with every probe; some probed sites
# reject requests without a User-Agent / Accept-Language.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
@dataclass
class ProbeResult:
    """Outcome of probing one external data source for page/data availability."""

    source: str  # short identifier of the probed source, e.g. "eastmoney"
    page_url: str  # URL that was (attempted to be) fetched
    fetched: bool  # True once the HTTP request completed without raising
    status_code: int | None = None  # HTTP status; None when the request failed
    title: str | None = None  # <title> text of the fetched page, if any
    latest_date: str | None = None  # newest YYYY-MM-DD date detected on the page
    direct_table_access: bool = False  # page exposes the data table directly
    pagination_access: bool = False  # further pages reachable via plain URLs
    realtime_hint: bool = False  # page hints at realtime/API-style data
    notes: list[str] = field(default_factory=list)  # free-form probe findings
    error: str | None = None  # "ExceptionType: message" when the probe failed
    extracted: dict[str, Any] = field(default_factory=dict)  # raw extracted signals
def fetch(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """Fetch *url* and return ``(status_code, decoded_body)``.

    The body is decoded with the charset declared in the response headers
    (falling back to UTF-8), silently dropping undecodable bytes. An
    unknown charset name also falls back to UTF-8.
    """
    request_headers = {**DEFAULT_HEADERS}
    if referer:
        request_headers["Referer"] = referer
    req = Request(url, headers=request_headers)
    with urlopen(req, timeout=timeout) as resp:
        charset = resp.headers.get_content_charset() or "utf-8"
        body = resp.read()
        try:
            text = body.decode(charset, "ignore")
        except LookupError:
            # Declared charset is not a codec Python knows about.
            text = body.decode("utf-8", "ignore")
        return resp.status, text
def fetch_gbk(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """Fetch *url* and return ``(status_code, body)`` decoded as GBK.

    Intended for sites (e.g. 10jqka) that serve GBK pages; undecodable
    bytes are dropped rather than raising.
    """
    request_headers = {**DEFAULT_HEADERS}
    if referer:
        request_headers["Referer"] = referer
    with urlopen(Request(url, headers=request_headers), timeout=timeout) as resp:
        return resp.status, resp.read().decode("gbk", "ignore")
def extract_title(html: str) -> str | None:
match = re.search(r"<title>(.*?)</title>", html, re.S | re.I)
if not match:
return None
return re.sub(r"\s+", " ", match.group(1)).strip()
def extract_first_date(html: str) -> str | None:
match = re.search(r"<td>(20\d{2}-\d{2}-\d{2})</td>", html)
if match:
return match.group(1)
match = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
return match.group(1) if match else None
def extract_page_info(html: str) -> str | None:
match = re.search(r'<span class="page_info">([^<]+)</span>', html)
return match.group(1).strip() if match else None
def probe_eastmoney() -> ProbeResult:
    """Probe the Eastmoney southbound-flow overview page for table/date signals."""
    page = "https://data.eastmoney.com/hsgtV2/hsgtDetail/scgkDetail_nx.html"
    result = ProbeResult(source="eastmoney", page_url=page, fetched=False)
    # Only the network call can raise these; everything after it is pure parsing.
    try:
        status, html = fetch(page, referer="https://data.eastmoney.com/")
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    # Prefer the explicitly labelled "更新时间" date; otherwise take any date.
    explicit = re.search(r"更新时间[: ]*</[^>]+>\s*<[^>]*>(20\d{2}-\d{2}-\d{2})", html)
    if explicit:
        result.latest_date = explicit.group(1)
    else:
        anywhere = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
        result.latest_date = anywhere.group(1) if anywhere else None
    has_south = "南向资金" in html
    has_sh = "港股通(沪)" in html
    has_sz = "港股通(深)" in html
    has_today = "成交净买额(当日)" in html
    result.direct_table_access = has_south and has_sh and has_sz
    result.pagination_access = ("南向历史" in html) or ("历史数据" in html)
    result.realtime_hint = has_today or ("实时" in html)
    result.notes.append("公开页面可访问。")
    result.notes.append("页面文案包含南向资金、港股通(沪)、港股通(深)。")
    if result.latest_date:
        result.notes.append(f"页面中检出日期 {result.latest_date}")
    result.extracted = {
        "contains_southbound": has_south,
        "contains_shanghai": has_sh,
        "contains_shenzhen": has_sz,
        "contains_today_net_buy": has_today,
    }
    return result
def probe_zhitong() -> ProbeResult:
    """Probe a Zhitong Finance article page for southbound-data / T+2 hints."""
    article_url = "https://www.zhitongcaijing.com/content/detail/1295067.html"
    result = ProbeResult(source="zhitong", page_url=article_url, fetched=False)
    # Only the network call can raise these; everything after it is pure parsing.
    try:
        status, html = fetch(article_url, referer="https://www.zhitongcaijing.com/")
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    # Prefer a full "date time" stamp; otherwise fall back to any date.
    stamped = re.search(r"(20\d{2}-\d{2}-\d{2})\s+\d{2}:\d{2}:\d{2}", html)
    result.latest_date = stamped.group(1) if stamped else extract_first_date(html)
    # An article page is not a queryable data table.
    result.direct_table_access = False
    result.pagination_access = False
    result.realtime_hint = False
    mentions_t2 = ("T+2" in html) or ("T2" in html)
    mentions_delay = ("延迟数据" in html) or ("T+2日结算" in html)
    result.notes += [
        "站点可访问,但当前命中的是资讯文章页。",
        "页面语义更偏新闻/统计解读,不是可直接分页拉取的标准数据表。",
    ]
    if mentions_t2 or mentions_delay:
        result.notes.append("页面明确指向 T+2 或延迟数据。")
    result.extracted = {
        "contains_t_plus_2": mentions_t2,
        "contains_delay_notice": mentions_delay,
        "contains_southbound": "南向资金" in html,
    }
    return result
def probe_wind() -> ProbeResult:
    """Probe the Wind product landing page (terminal/API product, not open data)."""
    portal_url = "https://www.wind.com.cn/portal/zh/WFT/index.html"
    result = ProbeResult(source="wind", page_url=portal_url, fetched=False)
    # Only the network call can raise these; everything after it is pure parsing.
    try:
        status, html = fetch(portal_url, referer="https://www.wind.com.cn/")
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    result.latest_date = extract_first_date(html)
    # A marketing page exposes neither a table nor pagination.
    result.direct_table_access = False
    result.pagination_access = False
    result.realtime_hint = ("API" in html) or ("Client API" in html)
    result.notes += [
        "官方产品页可访问。",
        "当前拿到的是产品介绍页,不是公开南向资金网页数据表。",
    ]
    if result.realtime_hint:
        result.notes.append("页面包含 API/客户端能力描述,说明数据更可能通过授权终端或接口获取。")
    result.extracted = {
        "contains_client_api": "Client API" in html,
        "contains_excel_plugin": "Excel" in html,
        "contains_terminal": "金融终端" in html,
    }
    return result
def probe_ths_reference() -> ProbeResult:
    """Probe 10jqka's public HKEX-connect history table and its plain-URL paging."""
    list_url = "https://data.10jqka.com.cn/hgt/ggtb/"
    result = ProbeResult(source="ths_reference", page_url=list_url, fetched=False)
    # Both fetches must succeed before the result counts as fetched.
    try:
        status, html = fetch_gbk(list_url, referer="https://data.10jqka.com.cn/")
        page2_status, page2_html = fetch_gbk(
            "https://data.10jqka.com.cn/hgt/ggtb/board/getGgtPage/page/2/",
            referer=list_url,
        )
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    table_marker = '<table class="m-table J-ajax-table">'
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    result.latest_date = extract_first_date(html)
    result.direct_table_access = table_marker in html
    result.pagination_access = (table_marker in page2_html) and (page2_status == 200)
    result.realtime_hint = False
    result.notes += [
        "同花顺公开 HTML 历史表可抓取。",
        "分页可通过非 ajax=1 直达 URL 访问。",
    ]
    result.extracted = {
        "page_info": extract_page_info(html),
        "page_2_first_date": extract_first_date(page2_html),
        "latest_date": result.latest_date,
    }
    return result
def main() -> None:
    """Run every source probe in order, persist the JSON report, print its path."""
    probes = (probe_eastmoney, probe_zhitong, probe_wind, probe_ths_reference)
    report = {
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        # Comprehension evaluates the probes left-to-right, preserving order.
        "results": [asdict(probe()) for probe in probes],
    }
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(str(OUTPUT_PATH))


if __name__ == "__main__":
    main()