"""Dump the readable text of every ``.html`` file in one folder tree.

Walks ``target_folder`` recursively, extracts the text of each ``.html``
file with BeautifulSoup, and writes everything to ``output_file``, with a
``SOURCE: <path>`` banner in front of each file's text.
"""

import os

from bs4 import BeautifulSoup

# Target ONLY the Interact Facebook Archive
target_folder = '/srv/files/Interact Facebook Archive'
output_file = '/srv/files/interact_full_dump.txt'


def extract_all_text(file_path: str) -> str:
    """Return the text of one HTML file, one non-empty line per row.

    ``<script>`` and ``<style>`` elements are removed before the text is
    collected. Each resulting line is stripped of surrounding whitespace and
    empty lines are dropped.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The cleaned text joined with newlines. If anything goes wrong while
        reading or parsing, a ``[ERROR READING ...]`` marker string is
        returned instead of raising.
    """
    try:
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f, 'html.parser')

            # Remove scripts and styles
            for tag in soup(["script", "style"]):
                tag.decompose()

            # Get every bit of text
            text = soup.get_text(separator=' ')

            # Cleanup excessive blank lines
            stripped = (line.strip() for line in text.splitlines())
            return "\n".join(line for line in stripped if line)
    except Exception as e:
        # Deliberately broad: the failure is recorded in the dump and the
        # caller carries on with the next file.
        return f"\n[ERROR READING {file_path}: {e}]\n"


def main() -> None:
    """Write the text of every ``.html`` file under ``target_folder``.

    The folder is validated before the output file is opened, so a missing
    or mistyped folder no longer truncates a dump written by an earlier run.
    """
    print(f"Starting text extraction for: {target_folder}...")

    if not os.path.isdir(target_folder):
        print(f"Error: Could not find the folder at {target_folder}. Check the spelling and spaces.")
        return

    banner = '=' * 30
    with open(output_file, 'w', encoding='utf-8') as out:
        for root, dirs, files in os.walk(target_folder):
            # Sorting in place fixes the traversal order, so repeated runs
            # over the same tree produce the same dump.
            dirs.sort()
            for name in sorted(files):
                if not name.endswith(".html"):
                    continue
                full_path = os.path.join(root, name)
                out.write(f"\n\n{banner}\nSOURCE: {full_path}\n{banner}\n\n")
                out.write(extract_all_text(full_path))

    print(f"\nComplete! The Interact text dump is located at: {output_file}")


if __name__ == "__main__":
    main()