# zjjk/tools/source_probe.py
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
# Repository root: one directory above this tools/ module.
ROOT = Path(__file__).resolve().parents[1]
# Probe results are persisted here as UTF-8 JSON for downstream consumption.
OUTPUT_PATH = ROOT / "backend" / "data" / "source_probe_results.json"
# Browser-like request headers sent with every probe; some probed sites
# reject requests without a User-Agent / Accept-Language.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
@dataclass
class ProbeResult:
    """Outcome of probing one external data source for page/data availability."""

    source: str  # short identifier of the probed source, e.g. "eastmoney"
    page_url: str  # URL that was (attempted to be) fetched
    fetched: bool  # True once the HTTP request completed without raising
    status_code: int | None = None  # HTTP status; None when the request failed
    title: str | None = None  # <title> text of the fetched page, if any
    latest_date: str | None = None  # newest YYYY-MM-DD date detected on the page
    direct_table_access: bool = False  # page exposes the data table directly
    pagination_access: bool = False  # further pages reachable via plain URLs
    realtime_hint: bool = False  # page hints at realtime/API-style data
    notes: list[str] = field(default_factory=list)  # free-form probe findings
    error: str | None = None  # "ExceptionType: message" when the probe failed
    extracted: dict[str, Any] = field(default_factory=dict)  # raw extracted signals
def fetch(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """Fetch *url* and return ``(status_code, decoded_body)``.

    The body is decoded with the charset declared in the response headers
    (falling back to UTF-8), silently dropping undecodable bytes. An
    unknown charset name also falls back to UTF-8.
    """
    request_headers = {**DEFAULT_HEADERS}
    if referer:
        request_headers["Referer"] = referer
    req = Request(url, headers=request_headers)
    with urlopen(req, timeout=timeout) as resp:
        charset = resp.headers.get_content_charset() or "utf-8"
        body = resp.read()
        try:
            text = body.decode(charset, "ignore")
        except LookupError:
            # Declared charset is not a codec Python knows about.
            text = body.decode("utf-8", "ignore")
        return resp.status, text
def fetch_gbk(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """Fetch *url* and return ``(status_code, body)`` decoded as GBK.

    Intended for sites (e.g. 10jqka) that serve GBK pages; undecodable
    bytes are dropped rather than raising.
    """
    request_headers = {**DEFAULT_HEADERS}
    if referer:
        request_headers["Referer"] = referer
    with urlopen(Request(url, headers=request_headers), timeout=timeout) as resp:
        return resp.status, resp.read().decode("gbk", "ignore")
def extract_title(html: str) -> str | None:
match = re.search(r"<title>(.*?)</title>", html, re.S | re.I)
if not match:
return None
return re.sub(r"\s+", " ", match.group(1)).strip()
def extract_first_date(html: str) -> str | None:
match = re.search(r"<td>(20\d{2}-\d{2}-\d{2})</td>", html)
if match:
return match.group(1)
match = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
return match.group(1) if match else None
def extract_page_info(html: str) -> str | None:
match = re.search(r'<span class="page_info">([^<]+)</span>', html)
return match.group(1).strip() if match else None
def probe_eastmoney() -> ProbeResult:
    """Probe the Eastmoney southbound-flow overview page for table/date signals."""
    page = "https://data.eastmoney.com/hsgtV2/hsgtDetail/scgkDetail_nx.html"
    result = ProbeResult(source="eastmoney", page_url=page, fetched=False)
    # Only the network call can raise these; everything after it is pure parsing.
    try:
        status, html = fetch(page, referer="https://data.eastmoney.com/")
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    # Prefer the explicitly labelled "更新时间" date; otherwise take any date.
    explicit = re.search(r"更新时间[: ]*</[^>]+>\s*<[^>]*>(20\d{2}-\d{2}-\d{2})", html)
    if explicit:
        result.latest_date = explicit.group(1)
    else:
        anywhere = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
        result.latest_date = anywhere.group(1) if anywhere else None
    has_south = "南向资金" in html
    has_sh = "港股通(沪)" in html
    has_sz = "港股通(深)" in html
    has_today = "成交净买额(当日)" in html
    result.direct_table_access = has_south and has_sh and has_sz
    result.pagination_access = ("南向历史" in html) or ("历史数据" in html)
    result.realtime_hint = has_today or ("实时" in html)
    result.notes.append("公开页面可访问。")
    result.notes.append("页面文案包含南向资金、港股通(沪)、港股通(深)。")
    if result.latest_date:
        result.notes.append(f"页面中检出日期 {result.latest_date}")
    result.extracted = {
        "contains_southbound": has_south,
        "contains_shanghai": has_sh,
        "contains_shenzhen": has_sz,
        "contains_today_net_buy": has_today,
    }
    return result
def probe_zhitong() -> ProbeResult:
    """Probe a Zhitong Finance article page for southbound-data / T+2 hints."""
    article_url = "https://www.zhitongcaijing.com/content/detail/1295067.html"
    result = ProbeResult(source="zhitong", page_url=article_url, fetched=False)
    # Only the network call can raise these; everything after it is pure parsing.
    try:
        status, html = fetch(article_url, referer="https://www.zhitongcaijing.com/")
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    # Prefer a full "date time" stamp; otherwise fall back to any date.
    stamped = re.search(r"(20\d{2}-\d{2}-\d{2})\s+\d{2}:\d{2}:\d{2}", html)
    result.latest_date = stamped.group(1) if stamped else extract_first_date(html)
    # An article page is not a queryable data table.
    result.direct_table_access = False
    result.pagination_access = False
    result.realtime_hint = False
    mentions_t2 = ("T+2" in html) or ("T2" in html)
    mentions_delay = ("延迟数据" in html) or ("T+2日结算" in html)
    result.notes += [
        "站点可访问,但当前命中的是资讯文章页。",
        "页面语义更偏新闻/统计解读,不是可直接分页拉取的标准数据表。",
    ]
    if mentions_t2 or mentions_delay:
        result.notes.append("页面明确指向 T+2 或延迟数据。")
    result.extracted = {
        "contains_t_plus_2": mentions_t2,
        "contains_delay_notice": mentions_delay,
        "contains_southbound": "南向资金" in html,
    }
    return result
def probe_wind() -> ProbeResult:
    """Probe the Wind product landing page (terminal/API product, not open data)."""
    portal_url = "https://www.wind.com.cn/portal/zh/WFT/index.html"
    result = ProbeResult(source="wind", page_url=portal_url, fetched=False)
    # Only the network call can raise these; everything after it is pure parsing.
    try:
        status, html = fetch(portal_url, referer="https://www.wind.com.cn/")
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    result.latest_date = extract_first_date(html)
    # A marketing page exposes neither a table nor pagination.
    result.direct_table_access = False
    result.pagination_access = False
    result.realtime_hint = ("API" in html) or ("Client API" in html)
    result.notes += [
        "官方产品页可访问。",
        "当前拿到的是产品介绍页,不是公开南向资金网页数据表。",
    ]
    if result.realtime_hint:
        result.notes.append("页面包含 API/客户端能力描述,说明数据更可能通过授权终端或接口获取。")
    result.extracted = {
        "contains_client_api": "Client API" in html,
        "contains_excel_plugin": "Excel" in html,
        "contains_terminal": "金融终端" in html,
    }
    return result
def probe_ths_reference() -> ProbeResult:
    """Probe 10jqka's public HKEX-connect history table and its plain-URL paging."""
    list_url = "https://data.10jqka.com.cn/hgt/ggtb/"
    result = ProbeResult(source="ths_reference", page_url=list_url, fetched=False)
    # Both fetches must succeed before the result counts as fetched.
    try:
        status, html = fetch_gbk(list_url, referer="https://data.10jqka.com.cn/")
        page2_status, page2_html = fetch_gbk(
            "https://data.10jqka.com.cn/hgt/ggtb/board/getGgtPage/page/2/",
            referer=list_url,
        )
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
        return result
    table_marker = '<table class="m-table J-ajax-table">'
    result.fetched = True
    result.status_code = status
    result.title = extract_title(html)
    result.latest_date = extract_first_date(html)
    result.direct_table_access = table_marker in html
    result.pagination_access = (table_marker in page2_html) and (page2_status == 200)
    result.realtime_hint = False
    result.notes += [
        "同花顺公开 HTML 历史表可抓取。",
        "分页可通过非 ajax=1 直达 URL 访问。",
    ]
    result.extracted = {
        "page_info": extract_page_info(html),
        "page_2_first_date": extract_first_date(page2_html),
        "latest_date": result.latest_date,
    }
    return result
def main() -> None:
    """Run every source probe in order, persist the JSON report, print its path."""
    probes = (probe_eastmoney, probe_zhitong, probe_wind, probe_ths_reference)
    report = {
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        # Comprehension evaluates the probes left-to-right, preserving order.
        "results": [asdict(probe()) for probe in probes],
    }
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(str(OUTPUT_PATH))


if __name__ == "__main__":
    main()