from docx import Document
import json
from pathlib import Path
import hashlib

# --------------------------------------------------
# CONFIGURATION
# --------------------------------------------------

# Input folder that is scanned for .docx files.
DOCS_DIR = Path("docs")

# Output folder for generated data; created at import time if it is missing.
OUTPUT_DIR = Path("docx_data")
OUTPUT_DIR.mkdir(exist_ok=True)

# JSON file that receives the extracted records.
OUTPUT_FILE = OUTPUT_DIR.joinpath("docx_raw.json")

# --------------------------------------------------
# DOCX TEXT EXTRACTION
# --------------------------------------------------

def extract_docx_text(docx_path: Path) -> str:
    """
    Return the text of a DOCX file as newline-separated paragraphs.

    Every entry of the document's ``paragraphs`` collection is stripped of
    surrounding whitespace; entries that are empty after stripping are
    dropped, and the remaining ones are joined with a single newline.
    """
    stripped = (para.text.strip() for para in Document(docx_path).paragraphs)
    return "\n".join(chunk for chunk in stripped if chunk)

# --------------------------------------------------
# PROCESS ALL DOCX FILES
# --------------------------------------------------

def process_docx_folder(folder: Path):
    """
    Build one JSON-ready record for every usable .docx file in a folder.

    Only files directly inside ``folder`` whose name matches ``*.docx`` are
    considered. They are visited in sorted path order so that the returned
    list is reproducible from run to run (``Path.glob`` itself gives no
    ordering guarantee).

    A file is skipped, with a ``[SKIP]`` message on stdout, when:
      * its name starts with ``~$`` -- Microsoft Word creates such owner/lock
        files next to an open document; they match ``*.docx`` but are not
        valid documents and would abort the whole batch if opened;
      * its extracted text is shorter than 200 characters.

    Args:
        folder: Directory to scan for .docx files.

    Returns:
        A list of dicts with the keys ``id`` (SHA-256 hex digest of
        ``"docx|<file name>"``), ``url`` (``"docx://<file name>"``),
        ``text`` (the extracted text) and ``source_type`` (always ``"docx"``).
    """
    records = []

    for docx_file in sorted(folder.glob("*.docx")):
        # Check the name before touching the file: lock files cannot be parsed.
        if docx_file.name.startswith("~$"):
            print(f"[SKIP] Temporary lock file: {docx_file.name}")
            continue

        text = extract_docx_text(docx_file)

        if len(text) < 200:
            print(f"[SKIP] Too little content: {docx_file.name}")
            continue

        # The id depends only on the file name, so it stays stable across
        # runs and content edits.
        doc_id = hashlib.sha256(
            f"docx|{docx_file.name}".encode("utf-8")
        ).hexdigest()

        records.append({
            "id": doc_id,
            "url": f"docx://{docx_file.name}",
            "text": text,
            "source_type": "docx"
        })

        print(f"[OK] Processed: {docx_file.name}")

    return records

# --------------------------------------------------
# EXECUTION
# --------------------------------------------------

if __name__ == "__main__":
    # Require a real directory: a regular file with this name would pass a
    # plain exists() check and then silently yield an empty output file.
    if not DOCS_DIR.is_dir():
        raise FileNotFoundError(f"{DOCS_DIR}/ folder not found")

    data = process_docx_folder(DOCS_DIR)

    # ensure_ascii=False keeps non-ASCII document text readable in the JSON.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print("\n----------------------------------")
    print(f"Total DOCX files processed: {len(data)}")
    print(f"Saved DOCX JSON to: {OUTPUT_FILE}")
    print("----------------------------------\n")

    # Show a short preview of the first record as a quick sanity check.
    if data:
        print("Sample document:")
        print("URL:", data[0]["url"])
        print("Text preview:\n")
        print(data[0]["text"][:500])
