import os

from bs4 import BeautifulSoup

# Archive locations - note the exact folder names (the Facebook one contains a space).
httrack_path = '/srv/files/rotary_archive'
facebook_path = '/srv/files/Facebook Archive'
output_file = '/srv/files/rotary_full_dump.txt'

# Extensions treated as HTML pages; compared case-insensitively so
# "INDEX.HTML" / "page.htm" variants in the archives are not skipped.
HTML_EXTENSIONS = ('.html', '.htm')


def extract_all_text(file_path):
    """Return the visible text of one HTML file as newline-separated lines.

    <script> and <style> elements are removed before extraction, blank
    lines are dropped, and each remaining line is stripped of surrounding
    whitespace.  On any failure an inline error marker string is returned
    instead of raising, so a single corrupt file cannot abort the dump.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Remove scripts and styles - their text is not page content.
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()
        # Get every bit of text.
        text = soup.get_text(separator=' ')
        # Collapse excessive blank lines.
        lines = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in lines if line)
    except Exception as e:
        # Best-effort by design: record the failure in the dump and move on.
        return f"\n[ERROR READING {file_path}: {e}]\n"


def _dump_tree(root_path, label, out):
    """Walk *root_path* and append extracted text of every HTML file to *out*.

    Each file is preceded by a banner line ``{label}: {path}`` so sections
    of the combined dump can be traced back to their source file.
    """
    for root, dirs, files in os.walk(root_path):
        for file in files:
            if file.lower().endswith(HTML_EXTENSIONS):
                full_path = os.path.join(root, file)
                out.write(f"\n\n{'='*30}\n{label}: {full_path}\n{'='*30}\n\n")
                out.write(extract_all_text(full_path))


def main():
    """Dump all text from both archives into one combined file."""
    print("Starting full dump...")
    with open(output_file, 'w', encoding='utf-8') as out:
        # Process HTTrack/ClubRunner files.
        if os.path.exists(httrack_path):
            print("Processing Rotary Web Archive...")
            _dump_tree(httrack_path, "FILE", out)
        else:
            # Warn (consistent with the Facebook branch) instead of silently skipping.
            print(f"Warning: Rotary archive path not found at {httrack_path}")

        # Process Facebook files.
        if os.path.exists(facebook_path):
            print("Processing Facebook Archive...")
            _dump_tree(facebook_path, "FB SOURCE", out)
        else:
            print(f"Warning: Facebook path not found at {facebook_path}")

    print(f"\nComplete! The full dump is located at: {output_file}")


if __name__ == "__main__":
    main()