import os
from bs4 import BeautifulSoup

# Target ONLY the Interact Facebook Archive: this is the root folder that the
# script walks recursively looking for .html files.
target_folder = '/srv/files/Interact Facebook Archive'
# Single text file that receives the extracted text of every HTML file found.
# It is opened in 'w' mode by the script, so each run replaces the prior dump.
output_file = '/srv/files/interact_full_dump.txt'

def extract_all_text(file_path):
    """Return the text content of one HTML file as newline-separated lines.

    The file is read as UTF-8 (undecodable bytes are ignored) and parsed with
    BeautifulSoup's 'html.parser'. All <script> and <style> elements are
    removed, the remaining text is collected with a single space between
    fragments, and every line is stripped; lines that end up empty are dropped.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The cleaned text joined with "\\n". If anything goes wrong (file
        missing, parse failure, ...), no exception is raised; instead a
        bracketed "[ERROR READING ...]" marker string is returned so the
        caller can keep going and the failure is visible in the dump.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            document = BeautifulSoup(handle, 'html.parser')

            # Script and style bodies are code, not readable content.
            for unwanted_tag in document.find_all(["script", "style"]):
                unwanted_tag.decompose()

            raw_text = document.get_text(separator=' ')

            # Keep only lines that still contain something after trimming.
            kept_lines = []
            for raw_line in raw_text.splitlines():
                trimmed = raw_line.strip()
                if trimmed:
                    kept_lines.append(trimmed)
            return "\n".join(kept_lines)

    except Exception as e:
        # Best-effort by design: report the failure inline instead of raising.
        return f"\n[ERROR READING {file_path}: {e}]\n"

# Driver: walk target_folder, and for every .html file append a header naming
# the source file followed by its extracted text to output_file.
print(f"Starting text extraction for: {target_folder}...")

# Validate the source folder BEFORE opening the output file: mode 'w'
# truncates on open, so a missing or mistyped folder must not wipe out a dump
# produced by an earlier run. isdir (rather than exists) also rejects the case
# where the path points at a regular file, which os.walk would silently skip.
if os.path.isdir(target_folder):
    with open(output_file, 'w', encoding='utf-8') as out:
        for root, dirs, files in os.walk(target_folder):
            # os.walk yields entries in arbitrary order; sorting dirs in place
            # (allowed in the default top-down mode) and sorting files makes
            # the dump order reproducible from run to run.
            dirs.sort()
            for file in sorted(files):
                if file.endswith(".html"):
                    full_path = os.path.join(root, file)
                    out.write(f"\n\n{'='*30}\nSOURCE: {full_path}\n{'='*30}\n\n")
                    out.write(extract_all_text(full_path))
    # Reported only after the with-block has flushed and closed the file.
    print(f"\nComplete! The Interact text dump is located at: {output_file}")
else:
    print(f"Error: Could not find the folder at {target_folder}. Check the spelling and spaces.")