"""Probe public web data sources for southbound (HK Connect) fund-flow data.

Fetches several public pages, records accessibility / freshness signals for
each, and writes the combined results as JSON under ``backend/data/``.
"""
from __future__ import annotations

import json
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


# Repository root: this script lives one directory level below it.
ROOT = Path(__file__).resolve().parents[1]

# Destination for the aggregated probe report.
OUTPUT_PATH = ROOT / "backend" / "data" / "source_probe_results.json"

# Browser-like request headers so the public endpoints serve normal HTML.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9",
}


@dataclass
class ProbeResult:
    """Outcome of probing one candidate data source.

    Required fields identify the source and whether the page was fetched;
    the remaining fields default to "nothing observed" and are filled in
    by the individual probe functions.
    """

    source: str
    page_url: str
    fetched: bool
    status_code: int | None = None
    title: str | None = None
    latest_date: str | None = None
    direct_table_access: bool = False
    pagination_access: bool = False
    realtime_hint: bool = False
    notes: list[str] = field(default_factory=list)
    error: str | None = None
    extracted: dict[str, Any] = field(default_factory=dict)


def fetch(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """GET *url* and return ``(status_code, decoded_html)``.

    The body is decoded with the charset advertised in the response's
    Content-Type header (UTF-8 when absent); undecodable bytes are dropped
    rather than raising.  An unrecognized charset name falls back to UTF-8.
    """
    request_headers = dict(DEFAULT_HEADERS)
    if referer:
        request_headers["Referer"] = referer
    with urlopen(Request(url, headers=request_headers), timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        body = response.read()
        try:
            text = body.decode(charset, "ignore")
        except LookupError:
            # Server advertised a charset Python does not know about.
            text = body.decode("utf-8", "ignore")
        return response.status, text


def fetch_gbk(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    """GET *url* and return ``(status_code, html)`` decoded as GBK.

    For sites that serve GBK pages; undecodable bytes are dropped.
    """
    request_headers = dict(DEFAULT_HEADERS)
    if referer:
        request_headers["Referer"] = referer
    with urlopen(Request(url, headers=request_headers), timeout=timeout) as response:
        payload = response.read()
        return response.status, payload.decode("gbk", "ignore")


def extract_title(html: str) -> str | None:
    """Return the page's <title> text with whitespace collapsed, or ``None``.

    Matching is case-insensitive and spans newlines inside the tag.
    """
    found = re.search(r"<title>(.*?)</title>", html, re.S | re.I)
    if found is None:
        return None
    return re.sub(r"\s+", " ", found.group(1)).strip()


def extract_first_date(html: str) -> str | None:
    """Return the first ``20YY-MM-DD`` date found in *html*, or ``None``.

    A date wrapped in a bare ``<td>`` cell is preferred over a date found
    anywhere else in the document.
    """
    in_cell = re.search(r"<td>(20\d{2}-\d{2}-\d{2})</td>", html)
    if in_cell:
        return in_cell.group(1)
    anywhere = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
    return anywhere.group(1) if anywhere else None


def extract_page_info(html: str) -> str | None:
    """Return the stripped text of the pagination span, or ``None``.

    Looks for ``<span class="page_info">…</span>`` as emitted by the
    10jqka history tables.
    """
    found = re.search(r'<span class="page_info">([^<]+)</span>', html)
    if found is None:
        return None
    return found.group(1).strip()


def probe_eastmoney() -> ProbeResult:
    """Probe Eastmoney's southbound (HK Connect) overview page.

    Fetches the public page, then scans the raw HTML for markers of:
    direct table access (southbound + both SH/SZ connect legs named),
    pagination/history availability, realtime wording, and a best-effort
    "latest date".  Network failures are captured in ``result.error``.
    """
    result = ProbeResult(
        source="eastmoney",
        page_url="https://data.eastmoney.com/hsgtV2/hsgtDetail/scgkDetail_nx.html",
        fetched=False,
    )
    try:
        status, html = fetch(result.page_url, referer="https://data.eastmoney.com/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        # Prefer a date adjacent to the explicit "update time" label; fall
        # back to the first date-looking string anywhere in the page.
        date_match = re.search(r"更新时间[:: ]*</[^>]+>\s*<[^>]*>(20\d{2}-\d{2}-\d{2})", html)
        fallback_date = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
        result.latest_date = date_match.group(1) if date_match else (fallback_date.group(1) if fallback_date else None)
        # Table access requires southbound funds plus both connect legs.
        result.direct_table_access = "南向资金" in html and "港股通(沪)" in html and "港股通(深)" in html
        result.pagination_access = "南向历史" in html or "历史数据" in html
        result.realtime_hint = "成交净买额(当日)" in html or "实时" in html
        result.notes.extend(
            [
                "公开页面可访问。",
                "页面文案包含南向资金、港股通(沪)、港股通(深)。",
            ]
        )
        if result.latest_date:
            result.notes.append(f"页面中检出日期 {result.latest_date}。")
        # Keep the individual marker hits for downstream inspection.
        result.extracted = {
            "contains_southbound": "南向资金" in html,
            "contains_shanghai": "港股通(沪)" in html,
            "contains_shenzhen": "港股通(深)" in html,
            "contains_today_net_buy": "成交净买额(当日)" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_zhitong() -> ProbeResult:
    """Probe a Zhitong Finance article page referencing southbound data.

    The target is a news article, not a data table, so table/pagination/
    realtime flags are hard-coded ``False``; the probe only records
    whether the page hints at T+2 / delayed data.  Network failures are
    captured in ``result.error``.
    """
    url = "https://www.zhitongcaijing.com/content/detail/1295067.html"
    result = ProbeResult(source="zhitong", page_url=url, fetched=False)
    try:
        status, html = fetch(url, referer="https://www.zhitongcaijing.com/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        # Prefer a full timestamp; otherwise any date on the page.
        date_match = re.search(r"(20\d{2}-\d{2}-\d{2})\s+\d{2}:\d{2}:\d{2}", html)
        result.latest_date = date_match.group(1) if date_match else extract_first_date(html)
        result.direct_table_access = False
        result.pagination_access = False
        result.realtime_hint = False
        # Fix: the original tested the identical literal "T+2" twice
        # ("T+2" in html or "T+2" in html) — the second test was dead.
        t2 = "T+2" in html
        delayed = "延迟数据" in html or "T+2日结算" in html
        result.notes.extend(
            [
                "站点可访问,但当前命中的是资讯文章页。",
                "页面语义更偏新闻/统计解读,不是可直接分页拉取的标准数据表。",
            ]
        )
        if t2 or delayed:
            result.notes.append("页面明确指向 T+2 或延迟数据。")
        result.extracted = {
            "contains_t_plus_2": t2,
            "contains_delay_notice": delayed,
            "contains_southbound": "南向资金" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_wind() -> ProbeResult:
    """Probe Wind's public product page.

    This is a marketing page, not a public data table, so the table and
    pagination flags are hard-coded ``False``; the probe records whether
    the page advertises API/terminal access instead.  Network failures
    are captured in ``result.error``.
    """
    url = "https://www.wind.com.cn/portal/zh/WFT/index.html"
    result = ProbeResult(source="wind", page_url=url, fetched=False)
    try:
        status, html = fetch(url, referer="https://www.wind.com.cn/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        result.latest_date = extract_first_date(html)
        result.direct_table_access = False
        result.pagination_access = False
        # Fix: the original `"API" in html or "Client API" in html` had a
        # dead second clause — "Client API" always contains "API".
        result.realtime_hint = "API" in html
        result.notes.extend(
            [
                "官方产品页可访问。",
                "当前拿到的是产品介绍页,不是公开南向资金网页数据表。",
            ]
        )
        if result.realtime_hint:
            result.notes.append("页面包含 API/客户端能力描述,说明数据更可能通过授权终端或接口获取。")
        result.extracted = {
            "contains_client_api": "Client API" in html,
            "contains_excel_plugin": "Excel" in html,
            "contains_terminal": "金融终端" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_ths_reference() -> ProbeResult:
    """Probe 10jqka's (Tonghuashun) public HK-Connect history table.

    Fetches the landing page (GBK-encoded) and a direct page-2 URL to
    verify both that the history table is in the HTML and that pagination
    is reachable without the ajax endpoint.  Network failures on either
    fetch are captured in ``result.error``.
    """
    result = ProbeResult(
        source="ths_reference",
        page_url="https://data.10jqka.com.cn/hgt/ggtb/",
        fetched=False,
    )
    try:
        status, html = fetch_gbk(result.page_url, referer="https://data.10jqka.com.cn/")
        # Second fetch: direct page-2 URL proves paginated history access.
        status_page_2, html_page_2 = fetch_gbk(
            "https://data.10jqka.com.cn/hgt/ggtb/board/getGgtPage/page/2/",
            referer=result.page_url,
        )
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        result.latest_date = extract_first_date(html)
        # The history table carries this exact class marker in the HTML.
        result.direct_table_access = "<table class=\"m-table J-ajax-table\">" in html
        result.pagination_access = "<table class=\"m-table J-ajax-table\">" in html_page_2 and status_page_2 == 200
        result.realtime_hint = False
        result.notes.extend(
            [
                "同花顺公开 HTML 历史表可抓取。",
                "分页可通过非 ajax=1 直达 URL 访问。",
            ]
        )
        result.extracted = {
            "page_info": extract_page_info(html),
            "page_2_first_date": extract_first_date(html_page_2),
            "latest_date": result.latest_date,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def main() -> None:
    """Run every source probe and write the combined JSON report.

    The output path is created (with parents) if missing, and the
    resulting file path is printed for the caller.
    """
    probes = (probe_eastmoney, probe_zhitong, probe_wind, probe_ths_reference)
    payload = {
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "results": [asdict(probe()) for probe in probes],
    }
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    OUTPUT_PATH.write_text(serialized, encoding="utf-8")
    print(str(OUTPUT_PATH))


if __name__ == "__main__":
    main()