#!/usr/bin/env bash
set -euo pipefail

# ===== CONFIG =====
# Where the per-notice JSON files are written.
OUT_DIR="gazette-2441-json"

# Politeness delays, in seconds (passed straight to `sleep`).
SLEEP_BETWEEN_PAGES=1   # pause after each search-results page
SLEEP_BETWEEN_JSON=0.1  # tiny pause after each JSON download

# Browser-like User-Agent sent with every request, assembled piece by piece.
UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
UA+=" AppleWebKit/537.36 (KHTML, like Gecko)"
UA+=" Chrome/125.0.0.0 Safari/537.36"

# Search URL: notice type 2441, published from 01/01/2020, 100 results per page.
# The page number is appended to this prefix by the code that fetches pages.
BASE_SEARCH_URL="https://www.thegazette.co.uk/insolvency/notice"
BASE_SEARCH_URL+="?noticetypes=2441"
BASE_SEARCH_URL+="&start-publish-date=01%2F01%2F2020"  # URL-encoded 01/01/2020
BASE_SEARCH_URL+="&end-publish-date="
BASE_SEARCH_URL+="&sort-by="
BASE_SEARCH_URL+="&results-page-size=100"
BASE_SEARCH_URL+="&results-page="

# Make sure the output directory exists before anything is downloaded.
mkdir -p "$OUT_DIR"

echo "=== Fetching first page to detect total notices ==="
# --fail makes curl exit non-zero on HTTP errors (403/429/5xx) instead of
# handing back the error page as if it were search results.
if ! first_page_html="$(curl -sS --fail -A "$UA" "${BASE_SEARCH_URL}1")"; then
  echo "ERROR: Could not fetch the first results page." >&2
  echo "Check network connectivity and that the User-Agent is accepted." >&2
  exit 1
fi

# Example text: "1 - 100 of 104017 notices"
# `|| true`: under pipefail the pipeline fails when grep finds nothing (or is
# cut short by head); the empty check below is what reports that case.
total_notices="$(grep -oP 'of\s+\K[0-9]+' <<<"$first_page_html" | head -n1 || true)"

if [[ -z "$total_notices" ]]; then
  echo "ERROR: Could not detect total number of notices." >&2
  echo "Check that the HTML structure hasn't changed or that the User-Agent is accepted." >&2
  exit 1
fi

# Take the page size from the search URL so the page count cannot drift from
# what is actually requested; fall back to 100 if the parameter is missing
# or zero. The 10# prefix forces base 10 so a zero-padded number is not
# interpreted as octal.
page_size=100
if [[ "$BASE_SEARCH_URL" =~ results-page-size=([0-9]+) ]]; then
  page_size=$(( 10#${BASH_REMATCH[1]} ))
  if (( page_size <= 0 )); then
    page_size=100
  fi
fi

# Ceiling division: a partially filled last page still counts as a page.
total_pages=$(( (10#$total_notices + page_size - 1) / page_size ))

echo "Found $total_notices notices, page size $page_size → $total_pages pages."
echo "Saving JSON files into: $OUT_DIR"
echo

# Helper to extract IDs from a page of HTML.
# Input:   HTML on stdin.
# Output:  the digits of every `article id="item-<digits>"` occurrence, one per
#          line, sorted and de-duplicated; prints nothing when there are none.
# Returns: 0 when the page has no matches as well as when it has some, so a
#          caller running under `set -e -o pipefail` can test for an empty
#          result instead of being aborted.
extract_ids() {
  # article id="item-4997523"
  # grep exits 1 for "no match"; map that to success and let any other
  # non-zero status (a real grep error) still fail the pipeline.
  { grep -oP 'article id="item-\K[0-9]+' || [[ $? -eq 1 ]]; } | sort -u
}

# Download the linked-data JSON for one notice unless it is already on disk.
# Globals:   OUT_DIR, UA, SLEEP_BETWEEN_JSON (read)
# Arguments: $1 - notice ID
# Outputs:   progress lines on stdout, error lines on stderr
# Returns:   0 if the file already existed or was downloaded, 1 on failure
download_notice() {
  local notice_id=$1
  local json_file="${OUT_DIR}/${notice_id}.json"
  local json_url="https://www.thegazette.co.uk/notice/${notice_id}/data.json?view=linked-data"
  local tmp_file="${json_file}.part"
  local status=0

  if [[ -s "$json_file" ]]; then
    echo "  [skip] $notice_id (already downloaded)"
    return 0
  fi

  echo "  [get ] $notice_id → $json_file"
  # --fail: an HTTP error body must not be stored as if it were notice JSON.
  # The transfer goes to a .part file and is renamed only once it is complete
  # and non-empty, so an interrupted run never leaves a truncated file that
  # the "already downloaded" check above would skip forever.
  if curl -sS --fail -A "$UA" "$json_url" -o "$tmp_file" && [[ -s "$tmp_file" ]]; then
    mv -f -- "$tmp_file" "$json_file"
  else
    echo "    ERROR downloading $json_url" >&2
    rm -f -- "$tmp_file"
    status=1
  fi
  sleep "$SLEEP_BETWEEN_JSON"
  return "$status"
}

# Walk every results page and download each notice listed on it.
# Page 1 is always processed, using the HTML that was already fetched.
last_page=$(( total_pages > 1 ? total_pages : 1 ))
failed_downloads=0

for (( page_num = 1; page_num <= last_page; page_num++ )); do
  if (( page_num == 1 )); then
    echo "=== Processing page $page_num / $total_pages ==="
    page_html=$first_page_html
  else
    echo
    echo "=== Processing page $page_num / $total_pages ==="
    # --fail: surface HTTP errors here instead of parsing an error page.
    if ! page_html="$(curl -sS --fail -A "$UA" "${BASE_SEARCH_URL}${page_num}")"; then
      echo "ERROR: Could not fetch results page $page_num." >&2
      exit 1
    fi
  fi

  # `|| true`: a page without matches must reach the empty check below rather
  # than abort the script through `set -e` / pipefail.
  ids_on_page="$(extract_ids <<<"$page_html" || true)"

  if [[ -z "$ids_on_page" ]]; then
    if (( page_num == 1 )); then
      echo "WARNING: No IDs found on page 1. HTML structure may have changed." >&2
    else
      echo "  WARNING: No IDs found on this page. Stopping early." >&2
      break
    fi
  fi

  # Word-splitting is intentional: the list is newline-separated digit strings.
  # shellcheck disable=SC2086
  for notice_id in $ids_on_page; do
    download_notice "$notice_id" || failed_downloads=$(( failed_downloads + 1 ))
  done

  sleep "$SLEEP_BETWEEN_PAGES"
done

echo
if (( failed_downloads > 0 )); then
  echo "WARNING: $failed_downloads download(s) failed; re-run the script to retry them." >&2
fi
echo "Done. JSON files are in: $OUT_DIR"
