#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Full Statement of Affairs (SoA) pipeline:

- Input: text file with company numbers (one per line)
- For each company:
    - Fetch SoA filings from Companies House API
    - Download SoA PDF from document API
    - (Optional) repair PDFs with qpdf/mutool/ghostscript
    - (Optional) OCR with ocrmypdf if mostly image
    - Convert to TXT with pdftotext
    - Parse "B- Company Creditors" blocks
    - Normalise creditor names (HMRC variants → HMRC, etc.)
    - Append rows to:
        - docs.csv      → document metadata
        - creditors.csv → creditor rows with normalised names
    - Delete PDFs (raw + repaired + OCR variants)

Outputs layout:

out_docs/
    pdfs/              # temporary PDFs (emptied per document)
    txt/               # TXT files (kept)
    logs/soa_docs.log  # run log
    docs.csv           # document-level info
    creditors.csv      # creditor-level rows

Chunk / batch controls (built-in):

You can now control which part of the input list is processed using:

    --offset N   → skip first N companies in the input file
    --limit M    → process at most M companies after offset (0 = all)

Examples:

    # Process first 1000 companies
    python3 full.py --input all_companies.txt --offset 0 --limit 1000

    # Process next 1000 (companies 1000–1999)
    python3 full.py --input all_companies.txt --offset 1000 --limit 1000

For daily small runs (~300/day), you can just use a short input file
or rely on --limit, e.g.:

    python3 full.py --input all_companies.txt --limit 300
"""

import os
import re
import csv
import sys
import time
import json
import shutil
import signal
import logging
import argparse
import subprocess
import base64
from pathlib import Path
from typing import List, Dict, Optional, Tuple

import requests
import pdfplumber


# ===========================
# CONFIG / GLOBALS
# ===========================

# Shared requests session; its default headers are sent with every call made
# through it (the Authorization header is added later by
# _ensure_api_key_and_header()).
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "soa-pipeline/1.0",
    "Accept": "application/json",
})

# API key taken from the environment; an empty value is rejected at start-up
# by _ensure_api_key_and_header().
CH_API_KEY = os.getenv("CH_API_KEY", "")

# Base URLs of the two services used; both can be overridden via environment
# variables.
API_BASE = os.getenv("CH_API_BASE", "https://api.company-information.service.gov.uk")
DOC_API_BASE = os.getenv("CH_DOC_API_BASE", "https://document-api.company-information.service.gov.uk")

# Root logging configuration (INFO level, timestamped lines). main() later
# attaches an additional file handler to `log`.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("soa_pipeline")

# Cooperative stop flag: set to True by the signal handler and polled by the
# processing loops so the run can end between units of work.
SHOULD_STOP = False


def _handle_stop(signum, frame):
    """Signal handler that requests a cooperative stop.

    Sets the module-level SHOULD_STOP flag; the processing loops check that
    flag and return early. Both arguments are part of the handler signature
    expected by ``signal.signal`` and are not used here.
    """
    global SHOULD_STOP
    SHOULD_STOP = True


# Route SIGINT and SIGTERM to the stop handler so the run ends via the
# SHOULD_STOP flag.
signal.signal(signal.SIGINT, _handle_stop)
signal.signal(signal.SIGTERM, _handle_stop)

# Request throttle. REQUESTS_PER_SECOND is the live value read by pace();
# main() overwrites it from the --rps argument.
DEFAULT_RPS = 1.5
REQUESTS_PER_SECOND = DEFAULT_RPS


# ===========================
# HTTP / AUTH HELPERS
# ===========================

def _ensure_api_key_and_header():
    """Install HTTP Basic auth on the shared session and probe the API once.

    Exits the process with status 2 when CH_API_KEY is empty and with status 3
    when the probe request is answered with 401. Any other request failure is
    only logged as a warning.
    """
    api_key = (CH_API_KEY or "").strip()
    if not api_key:
        log.error("CH_API_KEY not set. export CH_API_KEY=your_public_data_key")
        sys.exit(2)

    # Basic auth with the key as user name and an empty password.
    credentials = f"{api_key}:".encode("utf-8")
    encoded = base64.b64encode(credentials).decode("ascii")
    SESSION.headers["Authorization"] = "Basic " + encoded

    probe_url = f"{API_BASE}/company/00000006/filing-history?items_per_page=1"
    try:
        response = SESSION.get(probe_url, timeout=20)
        unauthorised = response.status_code == 401
        if unauthorised:
            log.error("401 Unauthorized on self-test. Use a Public Data API Key (not Streaming).")
            sys.exit(3)
        response.raise_for_status()
    except requests.RequestException as exc:
        log.warning("Auth self-test warning: %s", exc)


def pace():
    """Sleep for one request interval (1 / REQUESTS_PER_SECOND seconds)."""
    interval = 1.0 / REQUESTS_PER_SECOND
    # Never pass a negative duration to time.sleep.
    time.sleep(interval if interval > 0.0 else 0.0)


class CH429Error(Exception):
    """Raised by _get() when the API answers with HTTP 429 (rate limited)."""


def _get(url: str, **kwargs) -> requests.Response:
    """Throttled GET through the shared session with bounded 429 retries.

    Args:
        url: Absolute URL to fetch.
        **kwargs: Passed to ``SESSION.get``. ``timeout`` defaults to 30
            seconds; ``headers`` are merged over the session headers.

    Returns:
        The response of the first attempt that was not rate limited.

    Raises:
        CH429Error: Every attempt was answered with HTTP 429, or a stop was
            requested while rate limited.
        requests.HTTPError: Any other non-success status (via
            ``raise_for_status``).
    """
    timeout = kwargs.pop("timeout", 30)
    extra_headers = kwargs.pop("headers", None) or {}
    merged = {**SESSION.headers, **extra_headers}

    max_attempts = 3
    fallback_wait_s = 5  # used when Retry-After is missing or not an integer

    for attempt in range(1, max_attempts + 1):
        pace()
        resp = SESSION.get(url, headers=merged, timeout=timeout, **kwargs)

        if resp.status_code != 429:
            if resp.status_code == 401:
                log.error("401 Unauthorized for URL: %s", url)
            resp.raise_for_status()
            return resp

        try:
            wait_s = max(0, int(resp.headers.get("Retry-After")))
        except (TypeError, ValueError):
            wait_s = fallback_wait_s * attempt

        # Do not block a requested shutdown behind a rate-limit sleep.
        if SHOULD_STOP:
            break

        # Sleeping after the final attempt too keeps a cool-down before the
        # caller's next request.
        log.warning(
            "429 received. Sleeping %ss (attempt %d/%d).",
            wait_s,
            attempt,
            max_attempts,
        )
        time.sleep(wait_s)

    raise CH429Error("Hit Companies House rate limit (429).")


# ===========================
# UTILS
# ===========================

def abs_api_url(path_or_url: str) -> str:
    """Resolve *path_or_url* against API_BASE.

    Empty input yields an empty string, absolute http(s) URLs are returned
    unchanged, and anything else is appended to API_BASE with exactly one
    slash supplied when the path does not start with one.
    """
    if not path_or_url:
        return ""
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        return path_or_url
    joiner = "" if path_or_url.startswith("/") else "/"
    return API_BASE + joiner + path_or_url


def abs_doc_url(path_or_url: str) -> str:
    """Resolve *path_or_url* against DOC_API_BASE.

    Empty input yields an empty string, absolute http(s) URLs are returned
    unchanged, and anything else is appended to DOC_API_BASE with exactly one
    slash supplied when the path does not start with one.
    """
    if not path_or_url:
        return ""
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        return path_or_url
    joiner = "" if path_or_url.startswith("/") else "/"
    return DOC_API_BASE + joiner + path_or_url


def safe_num(company_number: str) -> str:
    """Return *company_number* reduced to upper-case ASCII letters and digits.

    Company numbers are not purely numeric: many carry a letter prefix
    (for example ``SC123456``). Removing every non-digit would turn such a
    number into a different one, so letters are kept and only separators,
    whitespace and other punctuation are dropped. ``None`` and empty input
    yield an empty string.
    """
    return re.sub(r"[^A-Za-z0-9]", "", company_number or "").upper()


def safe_filename(s: str) -> str:
    """Turn *s* into a conservative file name.

    Runs of characters other than word characters, ``-``, ``.`` and space
    become ``_``; whitespace is collapsed; leading/trailing spaces, dots and
    underscores are trimmed; remaining spaces become ``_``. An empty result
    is replaced by ``"file"``.
    """
    replaced = re.sub(r"[^\w\-. ]+", "_", s)
    trimmed = re.sub(r"\s+", " ", replaced).strip(" ._")
    underscored = trimmed.replace(" ", "_")
    if underscored:
        return underscored
    return "file"


def cmd_exists(bin_name: str) -> bool:
    """Return True when ``shutil.which`` can locate *bin_name*."""
    located = shutil.which(bin_name)
    return located is not None


# ===========================
# COMPANIES HOUSE HELPERS
# ===========================

def filing_history(company_number: str, items_per_page: int = 250) -> List[Dict]:
    """Return all filing-history items of a company, following pagination.

    A single request returns only one page, so older filings of companies
    with a long history would be missed. Pages are requested with an
    increasing ``start_index`` until the ``total_count`` reported by the API
    is reached. When the response carries no usable ``total_count`` only the
    first page is returned.

    Args:
        company_number: Company number used in the request path.
        items_per_page: Page size requested from the API.

    Returns:
        The concatenated ``items`` of every fetched page.

    Raises:
        Whatever ``_get`` raises (CH429Error, requests exceptions).
    """
    collected: List[Dict] = []
    start_index = 0

    while True:
        url = (
            f"{API_BASE}/company/{company_number}/filing-history"
            f"?items_per_page={items_per_page}&start_index={start_index}"
        )
        data = _get(url).json()
        page = data.get("items") or []
        collected.extend(page)

        # Advance by what was actually returned: the server may cap the
        # page size below the requested value.
        start_index += len(page)
        total = data.get("total_count")
        if not page or not isinstance(total, int) or start_index >= total:
            break

    return collected


def is_soa_filing(item: Dict) -> bool:
    """Decide whether a filing-history item is a Statement of Affairs.

    An item qualifies when any of these holds (all comparisons are
    case-insensitive):

    - its category contains "insolvency" and either the description contains
      "statement of affairs" or the type is "liq02";
    - ``description_values`` is a dict whose JSON dump contains
      "statement of affairs";
    - the description contains both "statement" and "affairs".
    """
    category = (item.get("category") or "").lower()
    form_type = (item.get("type") or "").lower()
    description = (item.get("description") or "").lower()
    phrase = "statement of affairs"

    insolvency_match = "insolvency" in category and (
        phrase in description or form_type == "liq02"
    )
    if insolvency_match:
        return True

    values = item.get("description_values")
    if isinstance(values, dict) and phrase in json.dumps(values).lower():
        return True

    return "statement" in description and "affairs" in description


def company_name(company_number: str) -> str:
    """Fetch the company profile and return its stripped ``company_name``.

    Any failure (request error, bad JSON, unexpected payload) is logged as a
    warning and yields an empty string.
    """
    profile_url = f"{API_BASE}/company/{company_number}"
    try:
        profile = _get(profile_url).json()
        name = profile.get("company_name") or ""
        return name.strip()
    except Exception as exc:
        log.warning("Company %s: failed to fetch profile (%s)", company_number, exc)
        return ""


def filing_history_for_company(company_number: str) -> List[Dict]:
    """Return the company's filing-history items accepted by is_soa_filing()."""
    return list(filter(is_soa_filing, filing_history(company_number)))


def get_document_download_url_from_metadata(meta_url: str) -> Tuple[str, str]:
    """Resolve a document-metadata URL to ``(content_url, filename)``.

    The content URL is taken from the first source that provides one:
    ``resources["application/pdf"]["url"]``, then ``links["document"]``, then
    ``links["self"]`` with ``/content`` appended. The filename is the
    metadata's ``original_filename`` or ``"document.pdf"``.

    Raises:
        RuntimeError: The metadata URL is empty, or the metadata offers no
            usable download URL.
    """
    meta_abs = abs_doc_url(meta_url)
    if not meta_abs:
        raise RuntimeError("Document metadata URL is empty or invalid.")

    meta = _get(meta_abs).json()

    pdf_resource = (meta.get("resources") or {}).get("application/pdf")
    url = pdf_resource.get("url") if isinstance(pdf_resource, dict) else None

    if not url:
        links = meta.get("links") or {}
        url = links.get("document")
        if not url:
            self_link = links.get("self")
            if self_link:
                url = self_link.rstrip("/") + "/content"

    if not url:
        raise RuntimeError(f"No downloadable URL in document metadata from {meta_abs}")

    return abs_doc_url(url), meta.get("original_filename") or "document.pdf"


def fetch_document_pdf(download_url: str) -> bytes:
    """Download *download_url* requesting ``application/pdf`` and return the body.

    Raises:
        RuntimeError: *download_url* is empty.
    """
    if download_url:
        response = _get(download_url, headers={"Accept": "application/pdf"})
        return response.content
    raise RuntimeError("Download URL is empty.")


# ===========================
# PDF PRE-PROCESS (REPAIR/OCR)
# ===========================

def repair_pdf(pdf_in: Path, pdf_out: Path, prefer: str = "qpdf") -> Path:
    """Run the available repair tools over *pdf_in* and return the best result.

    The tools qpdf, mutool and ghostscript are chained, each one reading the
    output of the last step that succeeded. *prefer* names the tool that runs
    first; the remaining tools follow in their default order. A failing step
    is logged and skipped without discarding the work of earlier steps.

    Intermediate files are written next to *pdf_in* as ``<stem>.qpdf.pdf``,
    ``<stem>.mutool.pdf`` and ``<stem>.gs.pdf``.

    Args:
        pdf_in: Source PDF.
        pdf_out: Destination for the final repaired PDF.
        prefer: Tool to run first ("qpdf", "mutool" or "gs"); any other value
            keeps the default order.

    Returns:
        *pdf_out* when at least one step succeeded, otherwise *pdf_in*. If the
        final copy to *pdf_out* fails, the last good intermediate file is
        returned instead.
    """
    # tool -> (argv builder, exit codes that count as success).
    # qpdf exits with 3 when it had warnings but still wrote the output,
    # which is the expected outcome for a damaged input.
    steps = {
        "qpdf": (
            lambda src, dst: ["qpdf", "--linearize", str(src), str(dst)],
            (0, 3),
        ),
        "mutool": (
            lambda src, dst: ["mutool", "clean", "-gg", "-i", "-s", str(src), str(dst)],
            (0,),
        ),
        "gs": (
            lambda src, dst: [
                "gs", "-q", "-o", str(dst),
                "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/prepress", str(src),
            ],
            (0,),
        ),
    }
    # Stable sort: the preferred tool moves to the front, the rest keep
    # their relative order.
    order = sorted(steps, key=lambda tool: tool != prefer)

    work = pdf_in
    for tool in order:
        if not cmd_exists(tool):
            continue
        build_argv, ok_codes = steps[tool]
        target = pdf_in.with_suffix(f".{tool}.pdf")
        try:
            done = subprocess.run(
                build_argv(work, target),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            produced = (
                done.returncode in ok_codes
                and target.exists()
                and target.stat().st_size > 0
            )
        except (OSError, subprocess.SubprocessError) as e:
            log.warning("PDF repair step warning: %s", e)
            continue

        if produced:
            work = target
        else:
            log.warning(
                "PDF repair step warning: %s exited with code %s",
                tool,
                done.returncode,
            )

    if work == pdf_in:
        # Nothing was available or nothing succeeded.
        return pdf_in

    try:
        if pdf_out != work:
            shutil.copy2(work, pdf_out)
    except OSError as e:
        log.warning("PDF repair step warning: %s", e)
        return work
    return pdf_out


def page_has_text(plumber_page) -> bool:
    """Return True when the page's ``extract_text()`` yields non-blank text.

    Any exception raised during extraction counts as "no text".
    """
    try:
        extracted = plumber_page.extract_text()
        if not extracted:
            return False
        return bool(extracted.strip())
    except Exception:
        return False


def pdf_is_mostly_image(pdf_path: Path, sample_pages: int = 5) -> bool:
    """Return True when none of the first *sample_pages* pages has text.

    The PDF is opened with pdfplumber. A document that cannot be opened or
    inspected is reported as image-only (True), as is one with no pages.
    """
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            sample_size = min(sample_pages, len(pdf.pages))
            pages_with_text = sum(
                1 for index in range(sample_size) if page_has_text(pdf.pages[index])
            )
            return pages_with_text == 0
    except Exception:
        return True


def ocrmypdf_available() -> bool:
    """Return True when an ``ocrmypdf`` executable can be located on PATH."""
    return shutil.which("ocrmypdf") is not None


def run_ocrmypdf(in_pdf: Path, out_pdf: Path, force: bool) -> bool:
    """Run ``ocrmypdf`` on *in_pdf*, writing *out_pdf*.

    ``--force-ocr`` is added when *force* is true. Returns True only when the
    command succeeded and *out_pdf* exists afterwards; every failure is
    logged as a warning and reported as False.
    """
    force_flag = ["--force-ocr"] if force else []
    command = ["ocrmypdf", "--jobs", "2", "--quiet", *force_flag, str(in_pdf), str(out_pdf)]

    try:
        subprocess.run(command, check=True)
        return out_pdf.exists()
    except subprocess.CalledProcessError as exc:
        # 130 and 137 are reported as an interruption rather than a failure.
        if exc.returncode in (130, 137):
            log.warning("ocrmypdf interrupted (code %s).", exc.returncode)
        else:
            log.warning("ocrmypdf failed: %s", exc)
    except Exception as exc:
        log.warning("ocrmypdf failed: %s", exc)
    return False


def preprocess_pdf(pdf_path: Path, force_ocr: bool, repair: bool, dpi: int, prefer_repair: str) -> Path:
    """Optionally repair and OCR *pdf_path*; return the PDF to convert to text.

    Args:
        pdf_path: Downloaded PDF.
        force_ocr: Run OCR even when text is detected.
        repair: Run repair_pdf() first, writing ``<stem>.repaired.pdf``.
        dpi: Accepted for interface compatibility; not used by this function.
        prefer_repair: Forwarded to repair_pdf() as ``prefer``.

    Returns:
        The OCR output when OCR ran successfully, otherwise the repaired PDF
        (if any), otherwise *pdf_path*.
    """
    work = pdf_path

    if repair:
        repaired = pdf_path.with_suffix(".repaired.pdf")
        work = repair_pdf(pdf_path, repaired, prefer=prefer_repair)

    # Check the tool first so the text probe is skipped when OCR is impossible.
    if ocrmypdf_available() and (force_ocr or pdf_is_mostly_image(work)):
        # Derive the name from the original PDF, not from `work`: a repaired
        # input would otherwise produce "<stem>.repaired.ocr.pdf", which
        # cleanup_pdfs() does not know about and would leave behind.
        ocr_out = pdf_path.with_suffix(".ocr.pdf")
        if run_ocrmypdf(work, ocr_out, force=True):
            work = ocr_out

    return work


def run_pdftotext(pdf_path: Path, txt_path: Path) -> bool:
    """Convert *pdf_path* to *txt_path* with ``pdftotext -layout``.

    Returns True when the command succeeded and *txt_path* exists. A missing
    ``pdftotext`` binary is logged as an error; any other failure is logged
    as a warning. Both cases return False.
    """
    if not cmd_exists("pdftotext"):
        log.error("pdftotext not found. Install poppler-utils.")
        return False

    command = ["pdftotext", "-layout", str(pdf_path), str(txt_path)]
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return txt_path.exists()
    except Exception as exc:
        # Covers a non-zero exit (CalledProcessError) as well as launch errors.
        log.warning("pdftotext failed: %s", exc)
        return False


# ===========================
# IO HELPERS
# ===========================

def ensure_dirs(base_out: Path) -> Tuple[Path, Path, Path]:
    """Create ``pdfs``, ``txt`` and ``logs`` under *base_out*.

    Returns the three directory paths in that order. Existing directories are
    left untouched and missing parents are created.
    """
    created = []
    for leaf in ("pdfs", "txt", "logs"):
        directory = base_out / leaf
        directory.mkdir(parents=True, exist_ok=True)
        created.append(directory)
    pdf_dir, txt_dir, log_dir = created
    return pdf_dir, txt_dir, log_dir


def write_docs_csv_row(csv_path: Path, row: Dict[str, str]):
    """Append one document row to *csv_path*.

    The parent directory is created when missing, and the header line is
    written when the file does not exist yet or is still empty.

    Args:
        csv_path: Destination CSV file.
        row: Mapping with the keys company_number, company_name, filing_id,
            pdf_filename and txt_filename.
    """
    fieldnames = ["company_number", "company_name", "filing_id", "pdf_filename", "txt_filename"]

    # The CSV may live outside the output directory tree, so its parent is
    # not guaranteed to exist.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0

    with csv_path.open("a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            w.writeheader()
        w.writerow(row)


def write_creditors_csv_rows(csv_path: Path, rows: List[Dict[str, str]]):
    """Append creditor rows to *csv_path*.

    Nothing is written (and no file is created) when *rows* is empty.
    Otherwise the parent directory is created when missing, and the header
    line is written when the file does not exist yet or is still empty.

    Args:
        csv_path: Destination CSV file.
        rows: Mappings with the keys company_number, company_name, filing_id,
            txt_filename, key, name, normalised_name, address and amount.
    """
    if not rows:
        return

    fieldnames = [
        "company_number",
        "company_name",
        "filing_id",
        "txt_filename",
        "key",
        "name",
        "normalised_name",
        "address",
        "amount",
    ]

    # The CSV may live outside the output directory tree, so its parent is
    # not guaranteed to exist.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0

    with csv_path.open("a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            w.writeheader()
        w.writerows(rows)


def cleanup_pdfs(pdf_dir: Path, base_stem: str):
    """Delete every PDF in *pdf_dir* that belongs to *base_stem*.

    A file belongs to the document when its name starts with
    ``"<base_stem>."`` and ends with ``".pdf"``. That covers the raw download
    and every derived variant, including chained suffixes such as
    ``<base_stem>.repaired.ocr.pdf`` that a fixed list of names would miss.
    Files of other documents (``<base_stem>2.pdf``) and non-PDF files are left
    alone. Failures are logged as warnings and never raised.
    """
    if not pdf_dir.is_dir():
        return

    prefix = f"{base_stem}."
    try:
        candidates = [
            p for p in pdf_dir.iterdir()
            if p.name.startswith(prefix) and p.name.endswith(".pdf")
        ]
    except OSError as e:
        log.warning("Failed to list %s (%s)", pdf_dir, e)
        return

    for p in candidates:
        try:
            if p.is_file():
                p.unlink()
        except OSError as e:
            log.warning("Failed to delete %s (%s)", p, e)


# ===========================
# NAME NORMALISATION
# ===========================

def normalise_creditor_name(name: str) -> str:
    """Map known creditor name variants to one canonical label.

    The name is lower-cased and every run of characters other than ``a-z`` and
    ``0-9`` is replaced by a single space; the resulting words are the
    "tokens". Rules are evaluated in order and the first match wins:

    - HMRC: contains "hmrc"; or has tokens hm + revenue + customs; or has the
      token "vat" or "paye".
    - TSB BANK: contains "tsb".
    - Barclays Bank: token barclays with bank, or with credit + card.
    - Natwest Bank: contains "natwest"; or tokens national + westminster +
      bank with plc (or its OCR misread "pic").
    - British Gas: tokens british + gas.
    - Santander Bank: token santander with bank, or with uk and plc/ple/pic,
      or with banking and business/buisness.
    - Fedex: contains "fedex".
    - Eon Energy: has the token "eon".
    - Lloyds Bank: contains "lloyds".

    "vat", "paye" and "eon" are matched as whole tokens only: as substrings
    they also occur inside unrelated words ("private", "elevator",
    "taxpayer", "pigeon") and would mislabel those creditors.

    Args:
        name: Raw creditor name; ``None`` is treated as empty.

    Returns:
        The canonical label, or the stripped input when no rule matches.
    """
    raw = (name or "").strip()
    if not raw:
        return raw

    compact = re.sub(r"[^a-z0-9]+", " ", raw.lower()).strip()
    tokens = set(compact.split())

    def has_all(*words: str) -> bool:
        return all(word in tokens for word in words)

    def has_any(*words: str) -> bool:
        return any(word in tokens for word in words)

    # Ordered (label, matched) pairs; order decides ties such as
    # "Barclays Bank VAT", which is HMRC.
    rules = (
        (
            "HMRC",
            "hmrc" in compact
            or has_all("hm", "revenue", "customs")
            or has_any("vat", "paye"),
        ),
        ("TSB BANK", "tsb" in compact),
        (
            "Barclays Bank",
            "barclays" in tokens and ("bank" in tokens or has_all("credit", "card")),
        ),
        (
            "Natwest Bank",
            "natwest" in compact
            or (has_all("national", "westminster", "bank") and has_any("plc", "pic")),
        ),
        ("British Gas", has_all("british", "gas")),
        (
            "Santander Bank",
            "santander" in tokens
            and (
                "bank" in tokens
                or ("uk" in tokens and has_any("ple", "plc", "pic"))
                or ("banking" in tokens and has_any("business", "buisness"))
            ),
        ),
        ("Fedex", "fedex" in compact),
        ("Eon Energy", "eon" in tokens),
        ("Lloyds Bank", "lloyds" in compact),
    )

    for label, matched in rules:
        if matched:
            return label
    return raw


# ===========================
# TXT → CREDITORS PARSING
# ===========================

def extract_creditors_from_txt(
    txt: str,
    company_number: str,
    company_name: str,
    filing_id: str,
    txt_filename: str,
) -> List[Dict[str, str]]:
    """Parse creditor rows from the "B - Company Creditors" sections of *txt*.

    Each header occurrence opens a block. A block ends at the first
    "Entries Totalling" marker after the header (matched case-insensitively
    and with flexible spacing); without a marker it ends at the next form
    feed, or at the end of the text. A block never extends past the next
    header: the header is typically repeated on every page while the marker
    appears once, and unclipped blocks would overlap and emit the rows of
    later pages several times.

    Within a block, a line is a creditor row when it starts with an
    alphanumeric key and ends with an amount such as ``1,234.56``. The text
    in between is split at the first run of two or more spaces into name and
    address.

    Args:
        txt: Text of the converted document.
        company_number, company_name, filing_id, txt_filename: Copied
            unchanged into every returned row.

    Returns:
        One dict per creditor row with the keys company_number, company_name,
        filing_id, txt_filename, key, name, normalised_name, address, amount.
    """
    header_pattern = re.compile(r"B\s*-\s*Company\s+Creditors", re.IGNORECASE)
    entries_totalling_pattern = re.compile(r"Entries\s+Totalling", re.IGNORECASE)
    line_pattern = re.compile(
        r"^\s*([A-Za-z0-9]+)\s+(.*?)(\d{1,3}(?:,\d{3})*(?:\.\d{2}))\s*\.?\s*$"
    )

    creditors: List[Dict[str, str]] = []
    headers = list(header_pattern.finditer(txt))

    for index, match in enumerate(headers):
        start = match.start()

        totals = entries_totalling_pattern.search(txt, start)
        if totals:
            end = totals.start()
        else:
            end = txt.find("\x0c", start)
            if end == -1:
                end = len(txt)

        # Clip at the next header so consecutive blocks do not overlap.
        if index + 1 < len(headers):
            end = min(end, headers[index + 1].start())

        for line in txt[start:end].splitlines():
            m = line_pattern.match(line)
            if not m:
                continue

            key, body, amount = m.groups()

            parts = re.split(r"\s{2,}", body.strip(), maxsplit=1)
            name = parts[0].strip()
            address = parts[1].strip() if len(parts) > 1 else ""

            creditors.append(
                {
                    "company_number": company_number,
                    "company_name": company_name,
                    "filing_id": filing_id,
                    "txt_filename": txt_filename,
                    "key": key,
                    "name": name,
                    "normalised_name": normalise_creditor_name(name),
                    "address": address,
                    "amount": amount,
                }
            )

    return creditors


# ===========================
# CORE PIPELINE
# ===========================

def process_company(
    company_number: str,
    outdir: Path,
    docs_csv: Path,
    creditors_csv: Path,
    force_ocr: bool,
    repair: bool,
    ocr_dpi: int,
    prefer_repair: str,
):
    """Download, convert and parse every Statement of Affairs of one company.

    For each SoA filing the PDF is downloaded into ``<outdir>/pdfs``,
    pre-processed, converted to ``<outdir>/txt/<stem>.txt``, recorded in
    *docs_csv*, parsed for creditors (appended to *creditors_csv*), and the
    PDFs are then removed. When text conversion fails the PDF is kept and the
    filing is skipped. Download errors skip the filing as well.

    The company profile is requested only after at least one SoA filing was
    found, so companies without one cost a single API request.

    Args:
        company_number: Raw company number; cleaned with safe_num().
        outdir: Base output directory.
        docs_csv: CSV receiving one row per converted document.
        creditors_csv: CSV receiving the extracted creditor rows.
        force_ocr: Forwarded to preprocess_pdf().
        repair: Forwarded to preprocess_pdf().
        ocr_dpi: Forwarded to preprocess_pdf() as ``dpi``.
        prefer_repair: Forwarded to preprocess_pdf().

    Raises:
        Errors from the filing-history request and from file writes propagate
        to the caller.
    """
    if SHOULD_STOP:
        return

    company_number = safe_num(company_number)
    if not company_number:
        return

    pdf_dir, txt_dir, _ = ensure_dirs(outdir)

    log.info("Company %s: fetching filing history…", company_number)
    soa_items = filing_history_for_company(company_number)
    if not soa_items:
        log.info("Company %s: no Statement of Affairs filings found.", company_number)
        return

    # Deferred until here: the name is only needed for CSV rows.
    cname = company_name(company_number)
    log.info("Company %s: %s", company_number, cname or "(no name)")

    for it in soa_items:
        if SHOULD_STOP:
            return

        links = it.get("links") or {}
        meta_url = links.get("document_metadata") or links.get("document")
        if not meta_url:
            log.warning(
                "Company %s: missing document metadata link (filing_id=%s).",
                company_number,
                it.get("transaction_id") or it.get("barcode"),
            )
            continue

        filing_id = it.get("transaction_id") or it.get("barcode") or "unknown"

        log.info("Company %s: downloading SoA (filing_id=%s)…", company_number, filing_id)
        try:
            download_url, _filename_hint = get_document_download_url_from_metadata(meta_url)
            pdf_bytes = fetch_document_pdf(download_url)
        except Exception as e:
            log.error("Company %s: download error (%s)", company_number, e)
            continue

        base_stem = safe_filename(f"{company_number}_{filing_id}_SoA")
        pdf_filename = f"{base_stem}.pdf"
        txt_filename = f"{base_stem}.txt"

        pdf_path = pdf_dir / pdf_filename
        txt_path = txt_dir / txt_filename

        pdf_path.write_bytes(pdf_bytes)

        prepped_pdf = preprocess_pdf(
            pdf_path,
            force_ocr=force_ocr,
            repair=repair,
            dpi=ocr_dpi,
            prefer_repair=prefer_repair,
        )

        log.info("Company %s: converting to text (filing_id=%s)…", company_number, filing_id)
        ok = run_pdftotext(prepped_pdf, txt_path)
        if not ok:
            # Deliberately no cleanup: the PDF stays for manual inspection.
            log.warning(
                "Company %s: pdftotext failed; keeping PDF for inspection (%s).",
                company_number,
                pdf_path,
            )
            continue

        write_docs_csv_row(
            docs_csv,
            {
                "company_number": company_number,
                "company_name": cname,
                "filing_id": filing_id,
                "pdf_filename": pdf_filename,
                "txt_filename": txt_filename,
            },
        )

        try:
            txt_content = txt_path.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            log.warning(
                "Company %s: failed reading TXT for creditors (filing_id=%s): %s",
                company_number,
                filing_id,
                e,
            )
            txt_content = ""

        creditors_rows = []
        if txt_content:
            creditors_rows = extract_creditors_from_txt(
                txt_content,
                company_number=company_number,
                company_name=cname,
                filing_id=filing_id,
                txt_filename=txt_filename,
            )

        if creditors_rows:
            write_creditors_csv_rows(creditors_csv, creditors_rows)
            log.info(
                "Company %s: extracted %d creditors from SoA (filing_id=%s).",
                company_number,
                len(creditors_rows),
                filing_id,
            )
        else:
            log.info(
                "Company %s: no creditors found in SoA TXT (filing_id=%s).",
                company_number,
                filing_id,
            )

        cleanup_pdfs(pdf_dir, base_stem)
        log.info(
            "Company %s: TXT=%s written, PDFs removed.",
            company_number,
            txt_filename,
        )


# ===========================
# MAIN
# ===========================

def main():
    """Command-line entry point.

    Parses the arguments, applies the request throttle, verifies the API key,
    attaches a file handler writing ``<outdir>/logs/soa_docs.log``, selects
    the ``[offset, offset + limit)`` slice of the input list and runs
    process_company() for each entry. A failure of one company is logged and
    does not stop the run; a stop signal ends the loop between companies.

    When ``--docs-csv`` / ``--creditors-csv`` are not given, the CSV files are
    placed inside ``--outdir`` (``docs.csv`` and ``creditors.csv``), matching
    the layout described in the module docstring.
    """
    global REQUESTS_PER_SECOND

    parser = argparse.ArgumentParser(
        description=(
            "Download Companies House SoA PDFs, OCR to TXT, "
            "extract 'B- Company Creditors' into CSV with normalised names, and delete PDFs."
        )
    )
    parser.add_argument(
        "--input",
        required=True,
        help="File with company numbers (one per line)",
    )
    parser.add_argument(
        "--outdir",
        default="out_docs",
        help="Base output directory (pdfs/txt/logs)",
    )
    parser.add_argument(
        "--docs-csv",
        default=None,
        help="Docs CSV output path",
    )
    parser.add_argument(
        "--creditors-csv",
        default=None,
        help="Creditors CSV output path",
    )
    parser.add_argument(
        "--rps",
        type=float,
        default=DEFAULT_RPS,
        help="Requests per second throttle",
    )
    parser.add_argument(
        "--force-ocr",
        action="store_true",
        help="Force OCR on all PDFs (even if text is detected)",
    )
    parser.add_argument(
        "--repair",
        action="store_true",
        help="Try to repair PDFs with qpdf/mutool/ghostscript before OCR",
    )
    parser.add_argument(
        "--prefer-repair",
        choices=["qpdf", "mutool", "gs"],
        default="qpdf",
        help="Preferred first repair tool",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=300,
        help="Reserved DPI param (for future rasterisation use)",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=0,
        help="Number of companies to skip from the input file before processing (for chunked runs)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Maximum number of companies to process after offset (0 = no limit)",
    )
    args = parser.parse_args()

    # Lower bound keeps pace() from sleeping excessively (or dividing by zero).
    REQUESTS_PER_SECOND = max(0.5, args.rps)
    _ensure_api_key_and_header()

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    _, _, log_dir = ensure_dirs(outdir)
    logfile = log_dir / "soa_docs.log"
    fh = logging.FileHandler(logfile, encoding="utf-8")
    fh.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
    log.addHandler(fh)

    # Defaults follow --outdir; a fixed "out_docs/..." default would point
    # into a directory that does not exist when --outdir is changed.
    docs_csv = Path(args.docs_csv) if args.docs_csv else outdir / "docs.csv"
    creditors_csv = Path(args.creditors_csv) if args.creditors_csv else outdir / "creditors.csv"

    with open(args.input, "r", encoding="utf-8") as f:
        companies = [ln.strip() for ln in f if ln.strip()]

    total = len(companies)
    start = max(0, args.offset)
    if start >= total:
        log.warning(
            "Offset %d is >= total companies %d. Nothing to do.",
            start,
            total,
        )
        return

    if args.limit and args.limit > 0:
        end = min(total, start + args.limit)
    else:
        end = total

    companies_slice = companies[start:end]
    log.info(
        "Loaded %d company numbers, processing %d (offset=%d, limit=%d, range=[%d,%d))",
        total,
        len(companies_slice),
        start,
        args.limit,
        start,
        end,
    )

    for cn in companies_slice:
        if SHOULD_STOP:
            break
        try:
            process_company(
                cn,
                outdir,
                docs_csv,
                creditors_csv,
                force_ocr=args.force_ocr,
                repair=args.repair,
                ocr_dpi=args.dpi,
                prefer_repair=args.prefer_repair,
            )
        except Exception as e:
            log.error("Company %s failed: %s", cn, e)

    log.info("Done.")


if __name__ == "__main__":
    main()
