From b8ae10990a4a9e06b2e72f871db2b54571050701 Mon Sep 17 00:00:00 2001
From: rmcdermo <randy.mcdermott@gmail.com>
Date: Wed, 31 Dec 2025 11:50:02 -0500
Subject: [PATCH] Bibliography: add script to automatically add doi to bib
 files

---
 Manuals/Bibliography/btac_add_doi.py | 217 +++++++++++++++++++++++++++
 Manuals/Bibliography/clean_bib.py    |  76 ----------
 2 files changed, 217 insertions(+), 76 deletions(-)
 create mode 100644 Manuals/Bibliography/btac_add_doi.py
 delete mode 100644 Manuals/Bibliography/clean_bib.py

diff --git a/Manuals/Bibliography/btac_add_doi.py b/Manuals/Bibliography/btac_add_doi.py
new file mode 100644
index 0000000000..f2940edf0f
--- /dev/null
+++ b/Manuals/Bibliography/btac_add_doi.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""
+btac_add_doi.py
+
+Normalize a BibTeX file so it is safe for bibtex-autocomplete (btac),
+then run btac to add DOI fields using Crossref.
+
+Chunking is OPTIONAL and disabled by default.
+
+Usage:
+    python btac_add_doi.py
+    python btac_add_doi.py input.bib
+    python btac_add_doi.py input.bib output_cleaned.bib
+    python btac_add_doi.py input.bib output_cleaned.bib --chunk-size 75
+
+Defaults:
+    input.bib          = FDS_general.bib
+    output_cleaned.bib = FDS_general_cleaned.bib
+    chunking           = OFF
+
+Final output:
+    <output_cleaned>.btac.bib
+
+Notes:
+- DOI lookup is performed using Crossref only.
+- Existing fields are not overwritten.
+- '--mark' is used so the script can be safely re-run.
+- Chunking may be enabled if rate limiting is encountered on large files.
+"""
+
+import argparse
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------
+# Cleaning logic
+# ---------------------------------------------------------------------
+def clean_bib(infile: Path, outfile: Path) -> None:
+    with open(infile, "r", encoding="utf8", errors="ignore") as f:
+        text = f.read()
+
+    # Normalize various forms of "unknown"
+    patterns_unknown = [
+        r'=\s*unknown',
+        r'=\s*"unknown"',
+        r"=\s*'unknown'",
+        r"=\s*''unknown''",
+    ]
+    for p in patterns_unknown:
+        text = re.sub(p, '= {unknown}', text)
+
+    # Fix empty assignments like: field = ,
+    text = re.sub(r'=\s*,', '= {unknown},', text)
+
+    # Wrap bare identifiers safely
+    NUMERIC_FIELDS = {"year", "volume", "number", "pages"}
+
+    def wrap_identifier(match):
+        field = match.group(1)
+        value = match.group(2)
+        field_l = field.lower()
+
+        if field_l in NUMERIC_FIELDS:
+            return match.group(0)
+
+        if value.startswith("{") or value.startswith('"'):
+            return match.group(0)
+
+        if value.isalpha() and value.islower():
+            return match.group(0)
+
+        if re.match(r'^[A-Za-z0-9._:-]+$', value):
+            return f"{field} = {{{value}}}"
+
+        return match.group(0)
+
+    text = re.sub(
+        r'(\w+)\s*=\s*([A-Za-z0-9._:-]+)',
+        wrap_identifier,
+        text
+    )
+
+    with open(outfile, "w", encoding="utf8") as f:
+        f.write(text)
+
+
+# ---------------------------------------------------------------------
+# BibTeX chunk helpers
+# ---------------------------------------------------------------------
+def split_bib_into_chunks(bibfile: Path, chunk_size: int):
+    with open(bibfile, "r", encoding="utf8") as f:
+        text = f.read()
+
+    entries = re.split(r'(?=@\w+{)', text)
+    entries = [e for e in entries if e.strip()]
+
+    chunks = []
+    for i in range(0, len(entries), chunk_size):
+        chunk_path = bibfile.with_name(
+            f"{bibfile.stem}_chunk_{i//chunk_size:03d}.bib"
+        )
+        with open(chunk_path, "w", encoding="utf8") as f:
+            f.write("".join(entries[i:i + chunk_size]))
+        chunks.append(chunk_path)
+
+    return chunks
+
+
+def merge_bib_files(bib_files, output_file: Path):
+    with open(output_file, "w", encoding="utf8") as out:
+        for bib in bib_files:
+            with open(bib, "r", encoding="utf8") as f:
+                out.write(f.read())
+
+
+# ---------------------------------------------------------------------
+# btac invocation
+# ---------------------------------------------------------------------
+def run_btac(bibfile: Path) -> None:
+    """
+    Manual equivalent:
+        btac --only-complete doi \
+             --filter-fields-by-entrytype all \
+             --only-query crossref \
+             --mark \
+             <bibfile>
+    """
+    cmd = [
+        "btac",
+        "--only-complete", "doi",
+        "--filter-fields-by-entrytype", "all",
+        "--only-query", "crossref",
+        "--mark",
+        str(bibfile),
+    ]
+
+    print("Running:", " ".join(cmd))
+    subprocess.run(cmd, check=True)
+
+
+# ---------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------
+def main():
+    parser = argparse.ArgumentParser(
+        description="Clean a BibTeX file and add DOI fields using btac (Crossref)."
+    )
+    parser.add_argument(
+        "input_bib",
+        nargs="?",
+        default="FDS_general.bib",
+        help="Input BibTeX file (default: FDS_general.bib)",
+    )
+    parser.add_argument(
+        "output_cleaned",
+        nargs="?",
+        default="FDS_general_cleaned.bib",
+        help="Cleaned BibTeX file (default: FDS_general_cleaned.bib)",
+    )
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=0,
+        help="Optional chunk size for btac processing (default: no chunking)",
+    )
+    parser.add_argument(
+        "--sleep",
+        type=int,
+        default=45,
+        help="Sleep time between chunks in seconds (default: 45)",
+    )
+
+    args = parser.parse_args()
+
+    input_bib = Path(args.input_bib)
+    output_cleaned = Path(args.output_cleaned)
+
+    if not input_bib.exists():
+        print(f"ERROR: input file not found: {input_bib}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Cleaning {input_bib} → {output_cleaned}")
+    clean_bib(input_bib, output_cleaned)
+
+    final_btac = output_cleaned.with_suffix(".btac.bib")
+
+    if args.chunk_size > 0:
+        print(f"\nRunning btac with chunking (size={args.chunk_size})")
+        chunks = split_bib_into_chunks(output_cleaned, args.chunk_size)
+        outputs = []
+
+        for i, chunk in enumerate(chunks, start=1):
+            print(f"\n[{i}/{len(chunks)}] btac on {chunk.name}")
+            run_btac(chunk)
+            outputs.append(chunk.with_suffix(".btac.bib"))
+
+            if i < len(chunks):
+                print(f"Sleeping {args.sleep} s...")
+                time.sleep(args.sleep)
+
+        merge_bib_files(outputs, final_btac)
+
+    else:
+        print("\nRunning btac without chunking")
+        run_btac(output_cleaned)
+
+    print("\nDone.")
+    print(f"Cleaned file:      {output_cleaned}")
+    print(f"DOI-enriched file: {final_btac}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Manuals/Bibliography/clean_bib.py b/Manuals/Bibliography/clean_bib.py
deleted file mode 100644
index ec9a2d80d0..0000000000
--- a/Manuals/Bibliography/clean_bib.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env python3
-import re
-import sys
-
-if len(sys.argv) != 3:
-    print("Usage: python clean_bib.py input.bib output.bib")
-    sys.exit(1)
-
-infile  = sys.argv[1]
-outfile = sys.argv[2]
-
-with open(infile, "r", encoding="utf8", errors="ignore") as f:
-    text = f.read()
-
-# -------------------------------------------------------------------
-# 1. Normalize various forms of "unknown"
-# -------------------------------------------------------------------
-patterns_unknown = [
-    r'=\s*unknown',
-    r'=\s*"unknown"',
-    r"=\s*'unknown'",
-    r"=\s*''unknown''",
-]
-
-for p in patterns_unknown:
-    text = re.sub(p, '= {unknown}', text)
-
-# -------------------------------------------------------------------
-# 2. Fix empty assignments like: field = ,
-# -------------------------------------------------------------------
-text = re.sub(r'=\s*,', '= {unknown},', text)
-
-# -------------------------------------------------------------------
-# 3. Wrap bare identifiers safely
-#    (avoid numeric fields and BibTeX string macros)
-# -------------------------------------------------------------------
-NUMERIC_FIELDS = {"year", "volume", "number", "pages"}
-
-def wrap_identifier(match):
-    field = match.group(1)
-    value = match.group(2)
-
-    field_l = field.lower()
-
-    # Skip numeric-only fields (valid bare values)
-    if field_l in NUMERIC_FIELDS:
-        return match.group(0)
-
-    # Leave already wrapped values alone
-    if value.startswith("{") or value.startswith('"'):
-        return match.group(0)
-
-    # Do not wrap BibTeX string macros (jan, feb, jfm, etc.)
-    if value.isalpha() and value.islower():
-        return match.group(0)
-
-    # Wrap simple bare identifiers
-    if re.match(r'^[A-Za-z0-9._:-]+$', value):
-        return f"{field} = {{{value}}}"
-
-    return match.group(0)
-
-text = re.sub(
-    r'(\w+)\s*=\s*([A-Za-z0-9._:-]+)',
-    wrap_identifier,
-    text
-)
-
-# -------------------------------------------------------------------
-# 4. Write cleaned output
-# -------------------------------------------------------------------
-with open(outfile, "w", encoding="utf8") as f:
-    f.write(text)
-
-print(f"Cleaned file written to {outfile}")
-