#!/usr/bin/env bash
#
# ============================
# SoA Chunk Runner
# ============================
#
# Splits $BIG_INPUT into chunk files of $CHUNK_SIZE lines each and runs
# $SCRIPT_NAME once per chunk, sequentially, passing the same output
# directory and CSV paths on every run.
#
# Usage:
#   export CH_API_KEY="YOUR_PUBLIC_DATA_KEY"
#   bash <this-script>
#
# Run it from the directory that contains $BIG_INPUT and $SCRIPT_NAME;
# all paths below are relative to the current working directory.

# -e: stop on unhandled failure; -u: unset variables are errors;
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail
# nullglob: a "${CHUNK_PREFIX}"* glob that matches nothing expands to zero
# words instead of the literal pattern.
shopt -s nullglob

# Constants are readonly so a later typo cannot silently repoint the run.
readonly BIG_INPUT="all_companies.txt"   # full list: one company number per line
readonly CHUNK_PREFIX="chunk_"           # filename prefix handed to split(1)
readonly CHUNK_SIZE=1000                 # companies (lines) per chunk file
readonly OUTDIR="out_docs"
readonly DOCS_CSV="${OUTDIR}/docs.csv"
readonly CREDITORS_CSV="${OUTDIR}/creditors.csv"
readonly VENV_DIR=".venv"
readonly SCRIPT_NAME="full.py"

echo "=== SoA Chunk Runner ==="

# --- sanity checks ---
# Fail fast before any splitting or processing starts. Diagnostics go to
# stderr so they stay visible when stdout is redirected or piped.
if [[ ! -f "$BIG_INPUT" ]]; then
  echo "ERROR: $BIG_INPUT not found. Create this file with one company number per line." >&2
  exit 1
fi

if [[ ! -f "$SCRIPT_NAME" ]]; then
  echo "ERROR: $SCRIPT_NAME not found in current directory." >&2
  exit 1
fi

# ${CH_API_KEY:-} lets us print a helpful message instead of tripping the
# terse "unbound variable" abort from `set -u`.
# NOTE(review): this script never reads the key itself; presumably
# $SCRIPT_NAME picks it up from the environment -- confirm.
if [[ -z "${CH_API_KEY:-}" ]]; then
  {
    echo "ERROR: CH_API_KEY is not set."
    echo "  export CH_API_KEY=\"YOUR_PUBLIC_DATA_KEY\""
  } >&2
  exit 1
fi

# --- activate venv if present ---

#######################################
# Activate the virtualenv in the given directory, if there is one.
# Arguments: $1 - virtualenv directory
# Outputs:   progress on stdout; warning / error on stderr
# Returns:   0 after activation, or when the directory does not exist
#            (the caller then runs with whatever python3 is on PATH);
#            exits the shell with status 1 when the directory exists but
#            has no bin/activate file
#######################################
activate_venv() {
  local venv_dir=$1
  local activate_script="${venv_dir}/bin/activate"
  local had_nounset=0

  if [[ ! -d "$venv_dir" ]]; then
    echo "WARNING: No virtualenv ($venv_dir) found. Using system python3." >&2
    return 0
  fi

  # A half-created venv would otherwise die with a bare "No such file"
  # from `source`; say what is actually wrong.
  if [[ ! -f "$activate_script" ]]; then
    echo "ERROR: $venv_dir exists but $activate_script is missing." >&2
    exit 1
  fi

  echo "Activating virtualenv: $venv_dir"

  # Activate scripts are third-party code and some (notably older ones)
  # expand variables that may be unset; under `set -u` that would abort
  # the whole run. Relax nounset only for the duration of the source.
  if [[ $- == *u* ]]; then
    had_nounset=1
  fi
  set +u
  # shellcheck disable=SC1090
  source "$activate_script"
  if (( had_nounset )); then
    set -u
  fi
}

activate_venv "$VENV_DIR"

mkdir -p "$OUTDIR"

echo
echo "Splitting $BIG_INPUT into chunks of $CHUNK_SIZE lines..."
# Remove chunk files left over from a previous run so they are not picked up
# again below. ${CHUNK_PREFIX:?} aborts instead of expanding to a bare `*`
# (i.e. "delete everything here") should the prefix ever be empty.
# NOTE(review): the glob also matches any unrelated file in the current
# directory whose name starts with the prefix -- keep the prefix distinctive.
rm -f -- "${CHUNK_PREFIX:?}"*
split -l "$CHUNK_SIZE" -- "$BIG_INPUT" "$CHUNK_PREFIX"

# Quote the prefix and leave only the `*` unquoted: the glob still expands,
# but the prefix itself is never word-split or globbed.
CHUNK_FILES=( "${CHUNK_PREFIX}"* )

# With nullglob enabled (set at the top of the script) an unmatched glob
# yields an empty array, which is what this check relies on.
TOTAL_CHUNKS=${#CHUNK_FILES[@]}
if (( TOTAL_CHUNKS == 0 )); then
  echo "No chunks created. Is $BIG_INPUT empty?" >&2
  exit 1
fi

echo "Created $TOTAL_CHUNKS chunk files:"
printf '  - %s\n' "${CHUNK_FILES[@]}"

# --- process each chunk sequentially ---

#######################################
# Run $SCRIPT_NAME once per chunk file, in the order given.
# Globals:   SCRIPT_NAME, OUTDIR, DOCS_CSV, CREDITORS_CSV (read)
# Arguments: chunk file paths, one per argument
# Outputs:   progress on stdout; failure notice on stderr
# Returns:   0 when every chunk was processed or skipped as empty;
#            exits the shell with status 1 as soon as one chunk fails,
#            leaving the remaining chunks unprocessed
#######################################
process_chunks() {
  local total=$#
  local index=0
  local chunk

  for chunk in "$@"; do
    # Deliberately not `(( index++ ))`: post-increment evaluates to the old
    # value, so on the first pass the expression is 0, the command returns
    # status 1, and `set -e` would abort the script before any work is done.
    index=$((index + 1))

    echo
    echo "=== Processing chunk $index / $total: $chunk ==="
    echo "Head of $chunk:"
    # Preview only; a failure to read the preview must not stop the run.
    head -n 3 "$chunk" || true

    if [[ ! -s "$chunk" ]]; then
      echo "Chunk $chunk is empty, skipping."
      continue
    fi

    echo "Running: python3 $SCRIPT_NAME --input $chunk --outdir $OUTDIR ..."
    if ! python3 "$SCRIPT_NAME" \
      --input "$chunk" \
      --outdir "$OUTDIR" \
      --docs-csv "$DOCS_CSV" \
      --creditors-csv "$CREDITORS_CSV" \
      --rps 1 \
      --repair \
      --force-ocr; then
      echo "Chunk $chunk failed – check logs in $OUTDIR/logs/soa_docs.log" >&2
      exit 1
    fi

    echo "Finished chunk $index: $chunk"
  done
}

process_chunks "${CHUNK_FILES[@]}"

# Closing summary: one blank line, then a framed list of the output paths.
# This script itself only creates $OUTDIR; the txt/ and logs/ entries are
# presumably produced by $SCRIPT_NAME -- verify there.
printf '%s\n' \
  "" \
  "========================================" \
  "All chunks processed." \
  "Docs CSV:       $DOCS_CSV" \
  "Creditors CSV:  $CREDITORS_CSV" \
  "TXT directory:  $OUTDIR/txt" \
  "Log file:       $OUTDIR/logs/soa_docs.log" \
  "========================================"
