#!/usr/bin/env bash
# Strict mode: stop on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ============================
# SoA Pipeline Installer
# ============================

# Installer settings. Every path is relative, so it resolves against the
# directory the installer is launched from.
PROJECT_NAME=soa_pipeline # not referenced anywhere else in this installer
VENV_DIR=.venv            # directory that holds the Python virtual environment
SCRIPT_NAME=full.py       # pipeline entry point that must sit next to the installer
INPUT_FILE=companies.txt  # list of company numbers, one per line

printf '%s\n' "=== SoA Pipeline Installer ==="

# --- Check we are in the right place ---
# The pipeline script is looked up relative to the current working directory.
# Abort early when it is absent. The messages go to stderr so they are not
# lost when stdout is redirected or piped.
if [[ ! -f "$SCRIPT_NAME" ]]; then
  echo "ERROR: $SCRIPT_NAME not found in current directory." >&2
  echo "Place $SCRIPT_NAME in this folder and run this installer from there." >&2
  exit 1
fi

# --- Detect package manager (Debian/Ubuntu assumed) ---
# PKG_MGR stays empty when apt-get is unavailable; the install step below is
# then skipped and the user is told which packages to install by hand.
PKG_MGR=""
if command -v apt-get >/dev/null 2>&1; then
  PKG_MGR="apt-get"
else
  echo "WARNING: apt-get not found. System deps will NOT be auto-installed." >&2
  echo "You must manually install: poppler-utils qpdf ghostscript mupdf-tools ocrmypdf" >&2
fi

# --- Install system dependencies (if apt-get is available) ---
# The whole step is best-effort: a failure here only produces a warning,
# because the python3 and tool checks later in the script report anything
# that is still missing.
if [[ -n "$PKG_MGR" ]]; then
  # Command prefix used to run the package manager with root privileges.
  # When already root (common in containers, where sudo is often not
  # installed) the package manager is called directly.
  pkg_cmd=()
  if ((EUID == 0)); then
    pkg_cmd=("$PKG_MGR")
  elif command -v sudo >/dev/null 2>&1; then
    pkg_cmd=(sudo "$PKG_MGR")
  else
    echo "WARNING: not running as root and sudo not found. System deps will NOT be auto-installed." >&2
    echo "You must manually install: poppler-utils qpdf ghostscript mupdf-tools ocrmypdf" >&2
  fi

  if ((${#pkg_cmd[@]} > 0)); then
    echo "Installing system packages via ${pkg_cmd[*]}..."
    # A failed index refresh must not abort the installer: the cached package
    # lists may still be good enough for the install below.
    "${pkg_cmd[@]}" update -y || {
      echo "WARNING: Package index update failed. Trying to install with existing package lists..." >&2
    }
    "${pkg_cmd[@]}" install -y \
      python3 python3-venv python3-pip \
      poppler-utils qpdf ghostscript mupdf-tools ocrmypdf || {
      echo "WARNING: Some packages may have failed to install. Continuing..." >&2
    }
  fi
fi

# --- Check python3 ---
# python3 is needed to build the virtual environment; without it nothing
# below can work, so this is a hard failure.
if ! command -v python3 >/dev/null 2>&1; then
  echo "ERROR: python3 not found. Install Python 3 and rerun." >&2
  exit 1
fi

# --- Create virtualenv ---
# The venv counts as usable only when its activate script exists. Testing for
# the directory alone would accept a half-created environment left behind by
# an earlier failed run, and the later `source` of bin/activate would fail.
if [[ -f "$VENV_DIR/bin/activate" ]]; then
  echo "Virtual environment $VENV_DIR already exists. Reusing it."
else
  if [[ -d "$VENV_DIR" ]]; then
    echo "WARNING: $VENV_DIR exists but has no bin/activate. Rebuilding it..." >&2
  else
    echo "Creating virtual environment in $VENV_DIR..."
  fi
  if ! python3 -m venv "$VENV_DIR"; then
    echo "ERROR: Failed to create virtual environment in $VENV_DIR." >&2
    echo "On Debian/Ubuntu, make sure the python3-venv package is installed and rerun." >&2
    exit 1
  fi
fi

# --- Activate venv and install Python deps ---
# The environment is activated in this shell; the pip calls below are
# expected to resolve to the venv's own pip as a result.
# shellcheck disable=SC1090
source "${VENV_DIR}/bin/activate"

printf '%s\n' "Upgrading pip..."
pip install --upgrade pip

# Python packages installed into the virtual environment.
python_packages=(requests pdfplumber)

printf '%s\n' "Installing Python dependencies (requests, pdfplumber)..."
pip install "${python_packages[@]}"

# --- Basic tool checks ---
# Look up every external command on PATH and collect the ones that are
# absent. This never aborts the installer; it only reports. The report
# separates the one tool this installer labels as required (pdftotext) from
# the optional ones, so a missing required tool is not described as
# "optional" and the "REQUIRED" line is not printed when pdftotext is present.
required_tool="pdftotext"
missing_tools=()
required_missing=0
optional_missing=0

for bin in pdftotext qpdf gs mutool ocrmypdf; do
  if ! command -v "$bin" >/dev/null 2>&1; then
    missing_tools+=("$bin")
    if [[ "$bin" == "$required_tool" ]]; then
      required_missing=1
    else
      optional_missing=1
    fi
  fi
done

if ((${#missing_tools[@]} > 0)); then
  # The whole report is a diagnostic, so it goes to stderr.
  {
    echo
    echo "WARNING: The following tools are missing:"
    printf '  - %s\n' "${missing_tools[@]}"
    if ((required_missing)); then
      echo "  * pdftotext is REQUIRED for text extraction (install poppler-utils)"
    fi
    if ((optional_missing)); then
      echo "  * ocrmypdf/qpdf/mutool/gs are optional but improve OCR/repair on bad PDFs"
    fi
  } >&2
fi

# --- Create default output directory ---
# -p makes this a no-op when the directory is already there.
OUTDIR=out_docs
mkdir -p "${OUTDIR}"

# --- Create sample companies.txt if missing ---
# An existing input file is left untouched; otherwise a one-line sample
# holding the demo company number is written.
[[ -f "${INPUT_FILE}" ]] || {
  printf '%s\n' "Creating sample $INPUT_FILE with demo company 00000006..."
  printf '%s\n' "00000006" >"${INPUT_FILE}"
}

# --- Final instructions ---
# Print the post-install summary. The heredoc delimiter is unquoted, so
# $VENV_DIR, $SCRIPT_NAME, $INPUT_FILE and $OUTDIR are expanded to their
# current values when the text is printed.
print_next_steps() {
  cat <<NEXT_STEPS

========================================
INSTALL COMPLETE

Next steps:

1) Set your Companies House API key (replace YOUR_KEY_HERE with your real key):

   export CH_API_KEY="YOUR_KEY_HERE"

2) Activate the virtualenv in this folder:

   source $VENV_DIR/bin/activate

3) Run the pipeline with the sample input:

   python3 $SCRIPT_NAME --input $INPUT_FILE --repair --force-ocr

Outputs will go into:

   $OUTDIR/docs.csv
   $OUTDIR/creditors.csv
   $OUTDIR/txt/
   $OUTDIR/logs/soa_docs.log

You can edit $INPUT_FILE to add more company numbers (one per line).
========================================
NEXT_STEPS
}

print_next_steps
