import os

from bs4 import BeautifulSoup

# Archive locations - note the exact folder names (the Facebook one contains a space).
httrack_path = '/srv/files/rotary_archive'
facebook_path = '/srv/files/Facebook Archive'
output_file = '/srv/files/rotary_full_dump.txt'

# Extensions treated as HTML pages; compared case-insensitively so
# "INDEX.HTML" / "page.htm" variants in the archives are not skipped.
HTML_EXTENSIONS = ('.html', '.htm')


def extract_all_text(file_path):
    """Return the visible text of one HTML file as newline-separated lines.

    <script> and <style> elements are removed before extraction, blank
    lines are dropped, and each remaining line is stripped of surrounding
    whitespace.  On any failure an inline error marker string is returned
    instead of raising, so a single corrupt file cannot abort the dump.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Remove scripts and styles - their text is not page content.
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()
        # Get every bit of text.
        text = soup.get_text(separator=' ')
        # Collapse excessive blank lines.
        lines = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in lines if line)
    except Exception as e:
        # Best-effort by design: record the failure in the dump and move on.
        return f"\n[ERROR READING {file_path}: {e}]\n"


def _dump_tree(root_path, label, out):
    """Walk *root_path* and append extracted text of every HTML file to *out*.

    Each file is preceded by a banner line ``{label}: {path}`` so sections
    of the combined dump can be traced back to their source file.
    """
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file.lower().endswith(HTML_EXTENSIONS):
                full_path = os.path.join(root, file)
                out.write(f"\n\n{'='*30}\n{label}: {full_path}\n{'='*30}\n\n")
                out.write(extract_all_text(full_path))


def main():
    """Dump all text from both archives into one combined file."""
    print("Starting full dump...")
    with open(output_file, 'w', encoding='utf-8') as out:
        # Process HTTrack/ClubRunner files.
        if os.path.exists(httrack_path):
            print("Processing Rotary Web Archive...")
            _dump_tree(httrack_path, "FILE", out)
        else:
            # Warn (consistent with the Facebook branch) instead of silently skipping.
            print(f"Warning: Rotary archive path not found at {httrack_path}")

        # Process Facebook files.
        if os.path.exists(facebook_path):
            print("Processing Facebook Archive...")
            _dump_tree(facebook_path, "FB SOURCE", out)
        else:
            print(f"Warning: Facebook path not found at {facebook_path}")

    print(f"\nComplete! The full dump is located at: {output_file}")


if __name__ == "__main__":
    main()