#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ch_soa_bulk.py: bulk-download Statement of Affairs filings from Companies House and extract creditor rows to CSV.

import os
import re
import csv
import sys
import time
import json
import shutil
import signal
import logging
import argparse
import subprocess
import base64
import warnings
from pathlib import Path
from typing import List, Dict, Optional, Tuple

import requests
import pandas as pd
import pdfplumber
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

warnings.filterwarnings("ignore", message=r".*image-based, camelot only works on text-based pages.*")

try:
    import camelot
    CAM_AVAILABLE = True
except Exception:
    CAM_AVAILABLE = False

try:
    import pytesseract
    from pdf2image import convert_from_path
    OCR_FALLBACK_AVAILABLE = True
except Exception:
    OCR_FALLBACK_AVAILABLE = False

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "soa-bulk-extractor/1.4",
    "Accept": "application/json",
})
CH_API_KEY = os.getenv("CH_API_KEY", "")

API_BASE = os.getenv("CH_API_BASE", "https://api.company-information.service.gov.uk")
DOC_API_BASE = os.getenv("CH_DOC_API_BASE", "https://document-api.company-information.service.gov.uk")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("soa_bulk")

# Graceful shutdown: SIGINT/SIGTERM set a flag that the processing loops check between filings and companies.
SHOULD_STOP = False
def _handle_stop(signum, frame):
    global SHOULD_STOP
    SHOULD_STOP = True
signal.signal(signal.SIGINT, _handle_stop)
signal.signal(signal.SIGTERM, _handle_stop)

DEFAULT_RPS = 1.5
REQUESTS_PER_SECOND = DEFAULT_RPS

def _ensure_api_key_and_header():
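    """Require CH_API_KEY, attach HTTP Basic auth (key as username, blank password), and run a cheap self-test request."""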
    key = (CH_API_KEY or "").strip()
    if not key:
        log.error("CH_API_KEY not set. export CH_API_KEY=your_api_key")
        sys.exit(2)
    token = base64.b64encode((key + ":").encode("utf-8")).decode("ascii")
    SESSION.headers["Authorization"] = f"Basic {token}"
    url = f"{API_BASE}/company/00000006/filing-history?items_per_page=1"
    try:
        r = SESSION.get(url, timeout=20)
        if r.status_code == 401:
            log.error("401 Unauthorized on self-test. Use a Public Data API Key (not Streaming). Re-export CH_API_KEY and retry.")
            sys.exit(3)
        r.raise_for_status()
    except requests.RequestException as e:
        log.warning("Auth self-test warning: %s", e)

def pace():
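    """Crude throttle: sleep a fixed 1/REQUESTS_PER_SECOND before every request (no elapsed-time accounting)."""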
    time.sleep(max(0.0, 1.0 / REQUESTS_PER_SECOND))

class CH429Error(Exception):
    pass

@retry(
    reraise=True,
    stop=stop_after_attempt(8),
    wait=wait_exponential(multiplier=1, min=2, max=60),
    retry=retry_if_exception_type((requests.RequestException, CH429Error)),
)
def _get(url: str, **kwargs) -> requests.Response:
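    """GET via the shared session; on 429, honour a numeric Retry-After, then raise so tenacity retries with backoff."""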
    pace()
    timeout = kwargs.pop("timeout", 30)
    headers = kwargs.pop("headers", {})
    merged = {**SESSION.headers, **headers}
    resp = SESSION.get(url, headers=merged, timeout=timeout, **kwargs)
    if resp.status_code == 429:
        ra = resp.headers.get("Retry-After")
        if ra:
            try:
                wait_s = int(ra)
                log.warning("429 received. Sleeping %ss (Retry-After).", wait_s)
                time.sleep(wait_s)
            except ValueError:
                pass
        raise CH429Error("Hit CH rate limit (429).")
    if resp.status_code == 401:
        log.error("401 Unauthorized for URL: %s", url)
    resp.raise_for_status()
    return resp

def abs_api_url(path_or_url: str) -> str:
    if not path_or_url:
        return ""
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        return path_or_url
    if path_or_url.startswith("/"):
        return f"{API_BASE}{path_or_url}"
    return f"{API_BASE}/{path_or_url}"

def abs_doc_url(path_or_url: str) -> str:
    if not path_or_url:
        return ""
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        return path_or_url
    if path_or_url.startswith("/"):
        return f"{DOC_API_BASE}{path_or_url}"
    return f"{DOC_API_BASE}/{path_or_url}"

def safe_num(company_number: str) -> str:
    return re.sub(r"\D", "", company_number or "")

def safe_filename(s: str) -> str:
    s = re.sub(r"[^\w\-. ]+", "_", s)
    s = re.sub(r"\s+", "_", s).strip("._")
    return s or "file"

def filing_history(company_number: str, items_per_page: int = 250) -> List[Dict]:
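    """Fetch one page of filing-history items (up to items_per_page); further pages are not followed."""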
    url = f"{API_BASE}/company/{company_number}/filing-history?items_per_page={items_per_page}"
    r = _get(url)
    data = r.json()
    return data.get("items", [])

def is_soa_filing(item: Dict) -> bool:
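    """Heuristic SoA detector: insolvency-category filings mentioning 'statement of affairs' (or typed LIQ02),
    plus looser checks on description_values and the description text."""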
    cat = (item.get("category") or "").lower()
    ftype = (item.get("type") or "").lower()
    desc = (item.get("description") or "").lower()
    if "insolvency" in cat and ("statement of affairs" in desc or ftype in {"liq02"}):
        return True
    dv = item.get("description_values")
    if isinstance(dv, dict):
        blob = json.dumps(dv).lower()
        if "statement of affairs" in blob:
            return True
    if "statement" in desc and "affairs" in desc:
        return True
    return False

def get_document_download_url_from_metadata(meta_url: str) -> Tuple[str, str]:
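    """Resolve a document's PDF content URL from Document API metadata.

    Prefers resources['application/pdf'].url, then links.document, then links.self + '/content'.
    Returns (absolute content URL, filename hint).
    """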
    meta_abs = abs_doc_url(meta_url)
    if not meta_abs:
        raise RuntimeError("Document metadata URL is empty or invalid.")
    r = _get(meta_abs)
    meta = r.json()
    resources = meta.get("resources") or {}
    pdf_res = resources.get("application/pdf")
    url = None
    if isinstance(pdf_res, dict):
        url = pdf_res.get("url")
    if not url:
        links = meta.get("links") or {}
        url = links.get("document")
    if not url:
        links = meta.get("links") or {}
        self_link = links.get("self")
        if self_link:
            url = self_link.rstrip("/") + "/content"
    if not url:
        raise RuntimeError(f"No downloadable URL in document metadata from {meta_abs}")
    content_abs = abs_doc_url(url)
    filename = meta.get("original_filename") or "document.pdf"
    return content_abs, filename

def fetch_document_pdf(download_url: str) -> bytes:
    if not download_url:
        raise RuntimeError("Download URL is empty.")
    r = _get(download_url, headers={"Accept": "application/pdf"})
    return r.content

def cmd_exists(bin_name: str) -> bool:
    return shutil.which(bin_name) is not None

def repair_pdf(pdf_in: Path, pdf_out: Path, prefer: str = "qpdf") -> Path:
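    """Chain the available repair tools (qpdf, then mutool clean, then Ghostscript) over the PDF.

    Note: `prefer` only gates the qpdf pass; mutool and gs run whenever they are installed.
    Returns pdf_out when at least one tool produced output, otherwise pdf_in.
    """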
    work = pdf_in
    tried = False
    try:
        if prefer == "qpdf" and cmd_exists("qpdf"):
            tried = True
            tmp = pdf_in.with_suffix(".qpdf.pdf")
            subprocess.run(["qpdf", "--linearize", str(pdf_in), str(tmp)],
                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            work = tmp
        if cmd_exists("mutool"):
            tried = True
            tmp2 = pdf_in.with_suffix(".mutool.pdf")
            subprocess.run(["mutool", "clean", "-gg", "-i", "-s", str(work), str(tmp2)],
                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            work = tmp2
        if cmd_exists("gs"):
            tried = True
            tmp3 = pdf_in.with_suffix(".gs.pdf")
            subprocess.run(["gs", "-q", "-o", str(tmp3), "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/prepress", str(work)],
                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            work = tmp3
    except subprocess.CalledProcessError as e:
        # A repair tool failed part-way; keep the output of the last successful step, if any.
        log.warning("PDF repair step warning: %s", e)
    except Exception as e:
        log.warning("PDF repair step warning: %s", e)
        return pdf_in
    if tried and work != pdf_in:
        if pdf_out != work:
            shutil.copy2(work, pdf_out)
        return pdf_out
    return pdf_in

def ocrmypdf_available() -> bool:
    return cmd_exists("ocrmypdf")

def run_ocrmypdf(in_pdf: Path, out_pdf: Path, force: bool) -> bool:
    args = ["ocrmypdf", "--jobs", "2", "--quiet"]
    if force:
        args.append("--force-ocr")
    args += [str(in_pdf), str(out_pdf)]
    try:
        subprocess.run(args, check=True)
        return out_pdf.exists()
    except subprocess.CalledProcessError as e:
        if e.returncode in (130, 137):
            log.warning("ocrmypdf interrupted (code %s). Skipping OCR for this file.", e.returncode)
            return False
        log.warning("ocrmypdf failed: %s", e)
        return False
    except Exception as e:
        log.warning("ocrmypdf failed: %s", e)
        return False

def page_has_text(plumber_page) -> bool:
    try:
        txt = plumber_page.extract_text() or ""
        return bool(txt.strip())
    except Exception:
        return False

def pdf_is_mostly_image(pdf_path: Path, sample_pages: int = 5) -> bool:
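    """Sample the first few pages and treat the PDF as image-only when none of them yields extractable text."""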
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            n = min(sample_pages, len(pdf.pages))
            textful = 0
            for i in range(n):
                if page_has_text(pdf.pages[i]):
                    textful += 1
            return textful == 0
    except Exception:
        return True

def fallback_ocr_with_tesseract(pdf_in: Path, pdf_out: Path, dpi: int = 300) -> Path:
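    """Fallback OCR: rasterise pages via pdf2image, OCR each page to a single-page PDF with Tesseract,
    then stitch the pages back together with qpdf when it is available."""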
    if not OCR_FALLBACK_AVAILABLE:
        return pdf_in
    try:
        images = convert_from_path(str(pdf_in), dpi=dpi)
        parts = []
        for idx, img in enumerate(images, start=1):
            data = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')
            part = pdf_in.with_name(f"{pdf_in.stem}.tess.{idx:03d}.pdf")
            with open(part, "wb") as f:
                f.write(data)
            parts.append(part)
        if not parts:
            return pdf_in
        if cmd_exists("qpdf") and len(parts) > 1:
            subprocess.run(["qpdf", "--empty", "--pages", *map(str, parts), "--", str(pdf_out)], check=True)
        else:
            if len(parts) > 1:
                log.warning("qpdf not found; keeping only the first OCR'd page of %s.", pdf_in.name)
            shutil.move(str(parts[0]), str(pdf_out))
        return pdf_out if pdf_out.exists() else pdf_in
    except subprocess.CalledProcessError as e:
        log.warning("Tesseract OCR fallback failed: %s", e)
        return pdf_in
    except Exception as e:
        log.warning("Tesseract OCR fallback failed: %s", e)
        return pdf_in

def preprocess_pdf(pdf_path: Path, force_ocr: bool, repair: bool, dpi: int, prefer_repair: str) -> Path:
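    """Optionally repair the PDF, then OCR it when forced or when it looks image-only
    (ocrmypdf first, Tesseract fallback otherwise). Returns the path to the best available copy."""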
    work = pdf_path
    if repair:
        repaired = pdf_path.with_suffix(".repaired.pdf")
        work = repair_pdf(pdf_path, repaired, prefer=prefer_repair)
    needs_ocr = force_ocr or pdf_is_mostly_image(work)
    if needs_ocr:
        ocr_out = work.with_suffix(".ocr.pdf")
        did = False
        if ocrmypdf_available():
            did = run_ocrmypdf(work, ocr_out, force=True)
        if did:
            work = ocr_out
        else:
            fb_out = work.with_suffix(".tesseract.pdf")
            work2 = fallback_ocr_with_tesseract(work, fb_out, dpi=dpi)
            if work2.exists():
                work = work2
    return work

# MONEY_RX captures a numeric amount with an optional £/GBP prefix and thousands separators.
MONEY_RX = re.compile(r"(?:(?:£|\bGBP\s*)?\s*)([-+]?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?)")
# KEYWORDS flags pages likely to contain a creditor schedule or Statement of Affairs summary.
KEYWORDS = re.compile(r"(creditor|schedule|statement\s+of\s+affairs|deficiency|liabilities|preferential|unsecured|trade\s+and\s+expense)", re.I)

def normalize_money(s: str) -> Optional[float]:
    if s is None:
        return None
    m = MONEY_RX.search(str(s))
    if not m:
        return None
    try:
        return float(m.group(1).replace(",", ""))
    except Exception:
        return None

def looks_like_creditor_table(df: pd.DataFrame) -> bool:
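    """A table qualifies when it has both a name-like column (creditor/name/supplier) and an amount-like column."""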
    cols = " ".join([str(c).lower() for c in df.columns])
    has_name = any(k in cols for k in ["creditor", "name", "supplier"])
    has_amt  = any(re.search(r"(amount|claim|debt|balance|total|£|gbp)", c, re.I) for c in map(str, df.columns))
    return has_name and has_amt

def find_candidate_pages(pdf_path: Path, max_pages: int = 250) -> List[int]:
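    """Return the 0-indexed pages (first max_pages only) whose extracted text matches KEYWORDS."""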
    idxs = []
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            n = min(len(pdf.pages), max_pages)
            for i in range(n):
                try:
                    txt = pdf.pages[i].extract_text() or ""
                    if KEYWORDS.search(txt):
                        idxs.append(i)
                except Exception:
                    continue
    except Exception:
        return []
    return idxs

def pdf_to_tables(pdf_path: Path, candidate_pages: Optional[List[int]] = None, use_camelot: bool = True) -> List[pd.DataFrame]:
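    """Extract tables with pdfplumber; if none are found and Camelot is enabled and installed,
    fall back to Camelot lattice then stream (Camelot page numbers are 1-indexed, hence i + 1)."""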
    dfs: List[pd.DataFrame] = []
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            pages = range(len(pdf.pages)) if candidate_pages is None else candidate_pages
            for i in pages:
                p = pdf.pages[i]
                try:
                    t = p.extract_table()
                    if t and len(t) > 1 and len(t[0]) > 1:
                        df = pd.DataFrame(t[1:], columns=[str(x).strip() for x in t[0]])
                        dfs.append(df)
                except Exception:
                    continue
    except Exception:
        pass
    if use_camelot and CAM_AVAILABLE and not dfs:
        pages = "all" if candidate_pages is None else ",".join(str(i+1) for i in candidate_pages)
        try:
            tables = camelot.read_pdf(str(pdf_path), flavor="lattice", pages=pages)
            for t in tables:
                if t.df.shape[0] >= 2 and t.df.shape[1] >= 2:
                    dfs.append(t.df)
        except Exception:
            pass
        if not dfs:
            try:
                tables = camelot.read_pdf(str(pdf_path), flavor="stream", pages=pages)
                for t in tables:
                    if t.df.shape[0] >= 2 and t.df.shape[1] >= 2:
                        dfs.append(t.df)
            except Exception:
                pass
    return dfs

def extract_creditors_from_dfs(dfs: List[pd.DataFrame], min_rows: int = 1) -> List[Dict]:
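    """Turn candidate tables into {creditor, amount} rows.

    Tables that do not look like creditor schedules are retried with their first row promoted to header.
    The name comes from the first name-like column, the amount from the first amount-like column that parses.
    """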
    out = []
    for df in dfs:
        df = df.copy()
        df.columns = [str(c).strip() for c in df.columns]
        if not looks_like_creditor_table(df):
            if df.shape[0] > 1:
                header = df.iloc[0].tolist()
                df2 = df.iloc[1:].copy()
                df2.columns = [str(h).strip() for h in header]
                if looks_like_creditor_table(df2):
                    df = df2
                else:
                    continue
            else:
                continue
        amount_cols = [c for c in df.columns if re.search(r"(amount|claim|debt|balance|total|£|gbp)", str(c), re.I)]
        if not amount_cols:
            amount_cols = [df.columns[-1]]
        for _, r in df.iterrows():
            name = None
            for c in df.columns:
                if re.search(r"(creditor|name|supplier)", str(c), re.I):
                    name = str(r.get(c) or "").strip()
                    if name:
                        break
            if not name and len(df.columns):
                name = str(r.get(df.columns[0]) or "").strip()
            amt = None
            for c in amount_cols:
                amt = normalize_money(r.get(c))
                if amt is not None:
                    break
            if name and amt is not None:
                out.append({"creditor": name, "amount": float(amt)})
    return out if len(out) >= min_rows else []

def extract_creditors_from_text(pdf_path: Path, candidate_pages: Optional[List[int]] = None, max_lines: int = 2000) -> Tuple[List[Dict], Optional[float]]:
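    """Line-based fallback: match 'name ... amount' lines on candidate pages and remember the first
    amount seen on a line containing 'total'. Returns (rows, detected_total)."""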
    rows: List[Dict] = []
    detected_total: Optional[float] = None
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            pages = range(len(pdf.pages)) if candidate_pages is None else candidate_pages
            for i in pages:
                try:
                    txt = pdf.pages[i].extract_text() or ""
                except Exception:
                    continue
                if not txt:
                    continue
                count = 0
                for line in txt.splitlines():
                    if count > max_lines:
                        break
                    if not line.strip():
                        continue
                    if re.search(r"\btotal\b", line, re.I) and detected_total is None:
                        maybe = normalize_money(line)
                        if maybe is not None:
                            detected_total = maybe
                    m = re.search(r"^(?P<name>.+?)\s+((£|\bGBP\b)\s*)?(?P<amt>[-+]?\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?)\s*$", line.strip())
                    if m:
                        name = m.group("name").strip()
                        amt = normalize_money(m.group("amt"))
                        if name and amt is not None and not re.search(r"\btotal\b", name, re.I):
                            rows.append({"creditor": name, "amount": float(amt)})
                    count += 1
    except Exception:
        return [], None
    return rows, detected_total

def detect_total_from_rows_or_sum(dfs: List[pd.DataFrame], creditors: List[Dict], text_total: Optional[float]) -> Optional[float]:
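    """Pick a total in preference order: the text-scan total, a 'total' row in any table
    (rightmost parsable cell), else the sum of extracted creditor amounts."""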
    if text_total is not None:
        return float(text_total)
    for df in dfs:
        try:
            for _, r in df.iterrows():
                row_str = " ".join([str(x) for x in r.values])
                if re.search(r"\btotal\b", row_str, re.I):
                    for v in list(r.values)[::-1]:
                        nm = normalize_money(v)
                        if nm is not None:
                            return float(nm)
        except Exception:
            continue
    if creditors:
        return round(sum(x["amount"] for x in creditors), 2)
    return None

def ensure_dirs(base_out: Path, company: str) -> Path:
    pdf_dir = base_out / "pdfs" / company
    pdf_dir.mkdir(parents=True, exist_ok=True)
    (base_out / "logs").mkdir(parents=True, exist_ok=True)
    return pdf_dir

def write_csv_rows(rows: List[Dict], out_csv: Path):
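    """Append rows to out_csv, writing the header only when the file does not yet exist."""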
    if not rows:
        return
    write_header = not out_csv.exists()
    with out_csv.open("a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(
            f,
            fieldnames=["company_number", "filing_id", "statement_date", "creditor", "amount", "detected_total"]
        )
        if write_header:
            w.writeheader()
        for r in rows:
            w.writerow(r)

def filing_history_for_company(company_number: str) -> List[Dict]:
    items = filing_history(company_number)
    return [it for it in items if is_soa_filing(it)]

def process_company(company_number: str, outdir: Path, csv_path: Path, force_ocr: bool, repair: bool, ocr_dpi: int, prefer_repair: str, use_camelot: bool, text_scan: bool, max_pages_scan: int, min_rows: int):
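    """Per-company pipeline: list SoA filings, download each PDF, repair/OCR it, extract tables,
    fall back to text scanning, then append creditor rows to the shared CSV."""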
    if SHOULD_STOP:
        return
    company_number = safe_num(company_number)
    if not company_number:
        return

    log.info("Company %s: fetching filing history…", company_number)
    soa_items = filing_history_for_company(company_number)
    if not soa_items:
        log.info("Company %s: no Statement of Affairs filings found.", company_number)
        return

    pdf_dir = ensure_dirs(outdir, company_number)

    for it in soa_items:
        if SHOULD_STOP:
            return

        links = it.get("links") or {}
        meta_url = links.get("document_metadata") or links.get("document")
        if not meta_url:
            log.warning("Company %s: missing document metadata link (filing_id=%s).", company_number, it.get("transaction_id") or it.get("barcode"))
            continue

        filing_id = it.get("transaction_id") or it.get("barcode") or "unknown"
        statement_date = it.get("date") or it.get("action_date") or it.get("received_date") or ""
        statement_date = statement_date[:10]

        log.info("Company %s: downloading SoA (filing_id=%s)…", company_number, filing_id)
        try:
            download_url, filename_hint = get_document_download_url_from_metadata(meta_url)
            pdf_bytes = fetch_document_pdf(download_url)
        except Exception as e:
            log.error("Company %s: download error (%s)", company_number, e)
            continue

        fname = safe_filename(f"{company_number}_SoA_{statement_date}_{filename_hint}")
        pdf_path = pdf_dir / fname
        with pdf_path.open("wb") as f:
            f.write(pdf_bytes)

        prepped_pdf = preprocess_pdf(pdf_path, force_ocr=force_ocr, repair=repair, dpi=ocr_dpi, prefer_repair=prefer_repair)

        candidate_pages = find_candidate_pages(prepped_pdf, max_pages=max_pages_scan)
        if not candidate_pages:
            candidate_pages = None

        log.info("Company %s: extracting creditor tables…", company_number)
        dfs = pdf_to_tables(prepped_pdf, candidate_pages=candidate_pages, use_camelot=use_camelot)
        creditors = extract_creditors_from_dfs(dfs, min_rows=min_rows)

        text_total: Optional[float] = None
        if (not creditors or text_scan) and not SHOULD_STOP:
            text_rows, text_total = extract_creditors_from_text(prepped_pdf, candidate_pages=candidate_pages)
            if text_rows:
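                # With --text-scan the text rows are appended to the table rows, so the same creditor can appear twice.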
                creditors = creditors + text_rows

        total = detect_total_from_rows_or_sum(dfs, creditors, text_total)

        rows = []
        for c in creditors:
            rows.append({
                "company_number": company_number,
                "filing_id": filing_id,
                "statement_date": statement_date,
                "creditor": c["creditor"],
                "amount": f"{c['amount']:.2f}",
                "detected_total": f"{total:.2f}" if total is not None else "",
            })
        write_csv_rows(rows, csv_path)

        log.info("Company %s: creditors=%d total=%s → %s",
                 company_number, len(creditors),
                 f"{total:.2f}" if total is not None else "n/a",
                 csv_path)

def main():
    global REQUESTS_PER_SECOND

    parser = argparse.ArgumentParser(description="Bulk download SoAs and extract creditors to CSV (local-only).")
    parser.add_argument("--input", required=True, help="File with company numbers (one per line)")
    parser.add_argument("--outdir", default="out", help="Base output directory (PDFs + logs)")
    parser.add_argument("--csv", default="out/debts.csv", help="CSV output path")
    parser.add_argument("--rps", type=float, default=DEFAULT_RPS, help="Requests per second throttle")
    parser.add_argument("--force-ocr", action="store_true", help="Force OCR on all PDFs")
    parser.add_argument("--repair", action="store_true", help="Try to repair PDFs with qpdf/mutool/ghostscript before OCR")
    parser.add_argument("--prefer-repair", choices=["qpdf","mutool","gs"], default="qpdf", help="Preferred first repair tool")
    parser.add_argument("--dpi", type=int, default=300, help="DPI for fallback OCR rasterization")
    parser.add_argument("--no-camelot", action="store_true", help="Disable Camelot fallback (use pdfplumber/text only)")
    parser.add_argument("--text-scan", action="store_true", help="Always run text-based extraction even if tables found")
    parser.add_argument("--max-pages", type=int, default=250, help="Max pages to scan for keywords")
    parser.add_argument("--min-rows", type=int, default=1, help="Minimum rows required to accept a table extraction")
    args = parser.parse_args()

    REQUESTS_PER_SECOND = max(0.5, args.rps)
    _ensure_api_key_and_header()

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    csv_path = Path(args.csv)
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with open(args.input, "r", encoding="utf-8") as f:
        companies = [ln.strip() for ln in f if ln.strip()]

    for cn in companies:
        if SHOULD_STOP:
            break
        try:
            process_company(
                cn, outdir, csv_path,
                force_ocr=args.force_ocr,
                repair=args.repair,
                ocr_dpi=args.dpi,
                prefer_repair=args.prefer_repair,
                use_camelot=not args.no_camelot,
                text_scan=args.text_scan,
                max_pages_scan=args.max_pages,
                min_rows=args.min_rows
            )
        except Exception as e:
            log.error("Company %s failed: %s", cn, e)

    log.info("Done.")

if __name__ == "__main__":
    main()
