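"""Probe public pages of candidate southbound (南向) Stock Connect fund-flow sources.

For each source the script fetches one public page, records whether it could be
retrieved, and checks for a directly scrapable table, pagination, and real-time
hints. The combined findings are written to backend/data/source_probe_results.json.
"""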
from __future__ import annotations

import json
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

ROOT = Path(__file__).resolve().parents[1]
OUTPUT_PATH = ROOT / "backend" / "data" / "source_probe_results.json"

DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "zh-CN,zh;q=0.9",
}


@dataclass
class ProbeResult:
    """Result of probing a single data-source page."""

    source: str
    page_url: str
    fetched: bool
    status_code: int | None = None
    title: str | None = None
    latest_date: str | None = None
    direct_table_access: bool = False
    pagination_access: bool = False
    realtime_hint: bool = False
    notes: list[str] = field(default_factory=list)
    error: str | None = None
    extracted: dict[str, Any] = field(default_factory=dict)
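

# Both fetch helpers send browser-like headers (User-Agent, Accept-Language) plus an
# optional Referer, on the assumption that the probed sites are stricter with bare
# requests. fetch() decodes using the charset declared in the response headers;
# fetch_gbk() is for pages (e.g. data.10jqka.com.cn) served as GBK.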
def fetch(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    headers = dict(DEFAULT_HEADERS)
    if referer:
        headers["Referer"] = referer
    request = Request(url, headers=headers)
    with urlopen(request, timeout=timeout) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        raw = response.read()
        try:
            html = raw.decode(charset, "ignore")
        except LookupError:
            # The declared charset is unknown to Python; fall back to UTF-8.
            html = raw.decode("utf-8", "ignore")
        return response.status, html


def fetch_gbk(url: str, *, referer: str | None = None, timeout: int = 20) -> tuple[int, str]:
    headers = dict(DEFAULT_HEADERS)
    if referer:
        headers["Referer"] = referer
    request = Request(url, headers=headers)
    with urlopen(request, timeout=timeout) as response:
        return response.status, response.read().decode("gbk", "ignore")
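

# The extract_* helpers pull fields out of the raw HTML with regular expressions
# rather than an HTML parser; the probes only need the <title>, the first ISO date
# in a table cell, and the pagination summary, so lightweight patterns suffice here.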
def extract_title(html: str) -> str | None:
    match = re.search(r"<title>(.*?)</title>", html, re.S | re.I)
    if not match:
        return None
    return re.sub(r"\s+", " ", match.group(1)).strip()


def extract_first_date(html: str) -> str | None:
    # Prefer a date inside a table cell; otherwise take the first date anywhere.
    match = re.search(r"<td>(20\d{2}-\d{2}-\d{2})</td>", html)
    if match:
        return match.group(1)
    match = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
    return match.group(1) if match else None


def extract_page_info(html: str) -> str | None:
    match = re.search(r'<span class="page_info">([^<]+)</span>', html)
    return match.group(1).strip() if match else None
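

# Each probe_* function targets one candidate source: it fetches a single public
# page, fills in a ProbeResult, and records network failures in `error` instead of
# raising, so one unreachable site does not abort the whole run.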
def probe_eastmoney() -> ProbeResult:
    result = ProbeResult(
        source="eastmoney",
        page_url="https://data.eastmoney.com/hsgtV2/hsgtDetail/scgkDetail_nx.html",
        fetched=False,
    )
    try:
        status, html = fetch(result.page_url, referer="https://data.eastmoney.com/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        date_match = re.search(r"更新时间[:: ]*</[^>]+>\s*<[^>]*>(20\d{2}-\d{2}-\d{2})", html)
        fallback_date = re.search(r"(20\d{2}-\d{2}-\d{2})", html)
        result.latest_date = date_match.group(1) if date_match else (
            fallback_date.group(1) if fallback_date else None
        )
        result.direct_table_access = "南向资金" in html and "港股通(沪)" in html and "港股通(深)" in html
        result.pagination_access = "南向历史" in html or "历史数据" in html
        result.realtime_hint = "成交净买额(当日)" in html or "实时" in html
        result.notes.extend(
            [
                "The public page is accessible.",
                "Page copy contains 南向资金, 港股通(沪) and 港股通(深).",
            ]
        )
        if result.latest_date:
            result.notes.append(f"Detected date {result.latest_date} on the page.")
        result.extracted = {
            "contains_southbound": "南向资金" in html,
            "contains_shanghai": "港股通(沪)" in html,
            "contains_shenzhen": "港股通(深)" in html,
            "contains_today_net_buy": "成交净买额(当日)" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_zhitong() -> ProbeResult:
    url = "https://www.zhitongcaijing.com/content/detail/1295067.html"
    result = ProbeResult(source="zhitong", page_url=url, fetched=False)
    try:
        status, html = fetch(url, referer="https://www.zhitongcaijing.com/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        date_match = re.search(r"(20\d{2}-\d{2}-\d{2})\s+\d{2}:\d{2}:\d{2}", html)
        result.latest_date = date_match.group(1) if date_match else extract_first_date(html)
        result.direct_table_access = False
        result.pagination_access = False
        result.realtime_hint = False
        t2 = "T+2" in html
        delayed = "延迟数据" in html or "T+2日结算" in html
        result.notes.extend(
            [
                "The site is reachable, but the URL probed here is a news article page.",
                "The page reads as news/statistics commentary, not a standard data table that can be paged through directly.",
            ]
        )
        if t2 or delayed:
            result.notes.append("The page explicitly points to T+2 or delayed data.")
        result.extracted = {
            "contains_t_plus_2": t2,
            "contains_delay_notice": delayed,
            "contains_southbound": "南向资金" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_wind() -> ProbeResult:
    url = "https://www.wind.com.cn/portal/zh/WFT/index.html"
    result = ProbeResult(source="wind", page_url=url, fetched=False)
    try:
        status, html = fetch(url, referer="https://www.wind.com.cn/")
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        result.latest_date = extract_first_date(html)
        result.direct_table_access = False
        result.pagination_access = False
        result.realtime_hint = "API" in html or "Client API" in html
        result.notes.extend(
            [
                "The official product page is accessible.",
                "What we get is a product introduction page, not a public web table of southbound fund-flow data.",
            ]
        )
        if result.realtime_hint:
            result.notes.append(
                "The page describes API/client capabilities, which suggests the data is more likely obtained through a licensed terminal or API."
            )
        result.extracted = {
            "contains_client_api": "Client API" in html,
            "contains_excel_plugin": "Excel" in html,
            "contains_terminal": "金融终端" in html,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result


def probe_ths_reference() -> ProbeResult:
    result = ProbeResult(
        source="ths_reference",
        page_url="https://data.10jqka.com.cn/hgt/ggtb/",
        fetched=False,
    )
    try:
        status, html = fetch_gbk(result.page_url, referer="https://data.10jqka.com.cn/")
        status_page_2, html_page_2 = fetch_gbk(
            "https://data.10jqka.com.cn/hgt/ggtb/board/getGgtPage/page/2/",
            referer=result.page_url,
        )
        result.fetched = True
        result.status_code = status
        result.title = extract_title(html)
        result.latest_date = extract_first_date(html)
        result.direct_table_access = '<table class="m-table J-ajax-table">' in html
        result.pagination_access = '<table class="m-table J-ajax-table">' in html_page_2 and status_page_2 == 200
        result.realtime_hint = False
        result.notes.extend(
            [
                "The public 10jqka (同花顺) HTML history table is scrapable.",
                "Pagination is reachable through direct page URLs without the ajax=1 parameter.",
            ]
        )
        result.extracted = {
            "page_info": extract_page_info(html),
            "page_2_first_date": extract_first_date(html_page_2),
            "latest_date": result.latest_date,
        }
    except (HTTPError, URLError, TimeoutError) as exc:
        result.error = f"{type(exc).__name__}: {exc}"
    return result
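

# main() runs the four probes and writes one UTF-8 JSON report to OUTPUT_PATH.
# Illustrative payload shape (field names come from ProbeResult; values are examples):
# {
#   "generated_at_utc": "2024-01-01T00:00:00+00:00",
#   "results": [{"source": "eastmoney", "fetched": true, "notes": [...], ...}]
# }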
def main() -> None:
    results = [
        probe_eastmoney(),
        probe_zhitong(),
        probe_wind(),
        probe_ths_reference(),
    ]
    payload = {
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "results": [asdict(item) for item in results],
    }
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(str(OUTPUT_PATH))


if __name__ == "__main__":
    main()