Files
zjjk/backend/app/services/capital_image_service.py
2026-04-08 20:04:40 +08:00

264 lines
10 KiB
Python

import base64
import json
import re
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from uuid import uuid4
from fastapi import HTTPException, UploadFile
from app.core.config import BASE_DIR, CAPITAL_IMAGE_DB_FILE, CAPITAL_IMAGE_UPLOADS_DIR
from app.repositories.monitoring_repository import MonitoringRepository
from app.repositories.capital_image_repository import CapitalImageRepository
def _extract_json_block(content: str) -> dict:
    """Return the first JSON object embedded in a model's text reply.

    The text inside a ```` ```json ```` fence (tag matched case-insensitively)
    is searched first; if no object can be decoded there, the whole reply is
    searched. Within each search space every ``{`` is tried in order and the
    first position from which a complete JSON object decodes wins, so stray
    braces in surrounding commentary or a second trailing object no longer
    break parsing.

    Note: if the outermost object is malformed (e.g. truncated output), a
    complete nested object that appears later may be returned instead.

    Args:
        content: Raw text returned by the model.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ValueError: If ``content`` is empty or contains no decodable JSON object.
    """
    not_found_message = "No JSON object found in model output"
    if not content:
        raise ValueError(not_found_message)

    search_spaces = []
    fenced_match = re.search(
        r"```json\s*(.*?)```", content, flags=re.DOTALL | re.IGNORECASE
    )
    if fenced_match:
        search_spaces.append(fenced_match.group(1))
    search_spaces.append(content)

    decoder = json.JSONDecoder()
    for text in search_spaces:
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode stops at the end of the first complete document,
                # so trailing prose after the object is ignored.
                parsed, _ = decoder.raw_decode(text[start:])
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            start = text.find("{", start + 1)
    raise ValueError(not_found_message)
class CapitalImageService:
def __init__(self) -> None:
self.repository = CapitalImageRepository(CAPITAL_IMAGE_DB_FILE)
self.monitoring_repository = MonitoringRepository()
def list_records(self, trade_date: str | None = None, subject: str | None = None) -> dict:
items = [
self._serialize_record(record)
for record in self.repository.list_records(trade_date=trade_date, subject=subject)
]
return {"items": items, "total": len(items)}
def get_record(self, record_id: str) -> dict:
record = self.repository.get_record(record_id)
if record is None:
raise HTTPException(status_code=404, detail="Record not found")
return self._serialize_record(record)
async def create_record(
self,
upload_file: UploadFile,
trade_date: str | None = None,
subject: str | None = None,
) -> dict:
suffix = Path(upload_file.filename or "upload.jpg").suffix or ".jpg"
record_id = uuid4().hex
image_name = upload_file.filename or f"{record_id}{suffix}"
stored_path = CAPITAL_IMAGE_UPLOADS_DIR / f"{record_id}{suffix.lower()}"
binary = await upload_file.read()
stored_path.parent.mkdir(parents=True, exist_ok=True)
stored_path.write_bytes(binary)
extraction = self._extract_from_image(
image_bytes=binary,
original_filename=image_name,
stored_path=stored_path,
trade_date=trade_date,
subject=subject,
)
now = datetime.now().isoformat(timespec="seconds")
payload = {
"id": record_id,
"trade_date": extraction.get("trade_date") or trade_date,
"subject": extraction.get("subject") or subject,
"snapshot_time": extraction.get("snapshot_time"),
"main_force_amount_yi": extraction.get("main_force_amount_yi"),
"institution_amount_yi": extraction.get("institution_amount_yi"),
"large_household_amount_yi": extraction.get("large_household_amount_yi"),
"retail_amount_yi": extraction.get("retail_amount_yi"),
"overall_trend": extraction.get("overall_trend"),
"intraday_summary": extraction.get("intraday_summary"),
"review_status": extraction.get("review_status", "pending_review"),
"extraction_method": extraction.get("extraction_method", "fallback"),
"image_name": image_name,
"image_path": str(stored_path),
"raw_extraction": extraction,
"created_at": now,
"updated_at": now,
}
record = self.repository.insert_record(payload)
return {"item": self._serialize_record(record)}
def _extract_from_image(
self,
image_bytes: bytes,
original_filename: str,
stored_path: Path,
trade_date: str | None,
subject: str | None,
) -> dict:
llm_config = self._get_llm_config()
if llm_config["api_key"]:
try:
return self._extract_via_model(
image_bytes=image_bytes,
trade_date=trade_date,
subject=subject,
llm_config=llm_config,
)
except Exception as exc: # pragma: no cover
return {
**self._build_fallback_payload(original_filename, trade_date, subject),
"review_status": "pending_review",
"extraction_method": "fallback_after_model_error",
"model_error": str(exc),
}
sidecar_payload = self._load_sidecar_payload(original_filename)
if sidecar_payload is not None:
return sidecar_payload
return self._build_fallback_payload(original_filename, trade_date, subject)
def _extract_via_model(
self,
image_bytes: bytes,
trade_date: str | None,
subject: str | None,
llm_config: dict,
) -> dict:
api_key = llm_config["api_key"]
base_url = llm_config["base_url"].rstrip("/")
model = llm_config["model"]
encoded_image = base64.b64encode(image_bytes).decode("utf-8")
prompt = """
You are extracting structured data from a Chinese stock capital flow screenshot.
Return only JSON with these keys:
trade_date, subject, snapshot_time, main_force_amount_yi, institution_amount_yi,
large_household_amount_yi, retail_amount_yi, overall_trend, intraday_summary,
review_status, extraction_method.
Rules:
1. intraday_summary must describe only the intraday capital-flow trend, not repeat raw numbers.
2. overall_trend should be a short phrase like "震荡上行", "冲高回落", "弱势下探", "午后修复".
3. If a number is not clearly visible, set it to null.
4. review_status should be "extracted".
5. extraction_method should be "vision_model".
6. If trade_date is absent in the image, keep null.
"""
payload = {
"model": model,
"messages": [
{
"role": "system",
"content": "You extract structured JSON from Chinese capital-flow screenshots."
},
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_image}",
},
},
],
}
],
}
request = urllib.request.Request(
url=f"{base_url}/chat/completions",
data=json.dumps(payload).encode("utf-8"),
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
method="POST",
)
try:
with urllib.request.urlopen(request, timeout=180) as response:
response_payload = json.loads(response.read().decode("utf-8"))
except urllib.error.HTTPError as exc: # pragma: no cover
error_text = exc.read().decode("utf-8", errors="ignore")
raise RuntimeError(f"Model request failed: {error_text}") from exc
choices = response_payload.get("choices", [])
content = ""
if choices:
content = choices[0].get("message", {}).get("content", "")
parsed = _extract_json_block(content)
if subject and not parsed.get("subject"):
parsed["subject"] = subject
if trade_date and not parsed.get("trade_date"):
parsed["trade_date"] = trade_date
return parsed
def _get_llm_config(self) -> dict:
config = self.monitoring_repository.get_system_config()
return {
"provider": config.get("llm_provider", "openai_compatible"),
"api_key": config.get("llm_api_key", ""),
"base_url": config.get("llm_base_url", "https://api.openai.com/v1"),
"model": config.get("llm_vision_model", "gpt-4.1-mini"),
}
def _load_sidecar_payload(self, original_filename: str) -> dict | None:
candidate_paths = [
BASE_DIR.parent / "zijin" / f"{Path(original_filename).stem}.json",
BASE_DIR / "data" / "capital_images" / f"{Path(original_filename).stem}.json",
]
for candidate in candidate_paths:
if candidate.exists():
payload = json.loads(candidate.read_text(encoding="utf-8"))
capital_flow = payload.get("capital_flow_amounts", {})
overall_trend = payload.get("overall_trend", {})
intraday_summary = overall_trend.get("summary") or payload.get("llm_summary")
return {
"trade_date": payload.get("date"),
"subject": payload.get("subject"),
"snapshot_time": payload.get("snapshot_time"),
"main_force_amount_yi": capital_flow.get("main_force_yi"),
"institution_amount_yi": capital_flow.get("institution_yi"),
"large_household_amount_yi": capital_flow.get("large_household_yi"),
"retail_amount_yi": capital_flow.get("retail_yi"),
"overall_trend": overall_trend.get("direction"),
"intraday_summary": intraday_summary,
"review_status": "sidecar_loaded",
"extraction_method": "sidecar_json",
"sidecar_path": str(candidate),
}
return None
def _build_fallback_payload(
self,
original_filename: str,
trade_date: str | None,
subject: str | None,
) -> dict:
return {
"trade_date": trade_date,
"subject": subject,
"snapshot_time": None,
"main_force_amount_yi": None,
"institution_amount_yi": None,
"large_household_amount_yi": None,
"retail_amount_yi": None,
"overall_trend": "待识别",
"intraday_summary": "当前未配置视觉模型,图片已保存,待接入大模型后补充日内资金走势总结。",
"review_status": "pending_review",
"extraction_method": "storage_only",
"fallback_reason": f"No vision model configured for {original_filename}",
}
def _serialize_record(self, record: dict) -> dict:
return {
**record,
"image_url": f"/capital-images/uploads/{Path(record['image_path']).name}",
}
# Module-level service instance, created at import time; its constructor
# builds the record repository and the system-config repository.
capital_image_service = CapitalImageService()