"""Scan a Rotary story archive for donation amounts and print an impact report.

Walks ARCHIVE_PATH for ``.html`` files, extracts dollar amounts from story
paragraphs whose text suggests an actual donation (and not ticket/meal
pricing), dedups near-identical mentions, and prints an estimated total.
"""

import os
import re

ARCHIVE_PATH = '/srv/files/rotary_archive/1131/Stories'

# Matches "$5,000.00", "$5,000", and plain "$5000"/"$5000.00".
# The (?!\d) lookahead prevents partial matches: without it, "$5000" would
# incorrectly match as "$500" and skew the total.
MONEY_PATTERN = re.compile(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?(?!\d)')

# Keywords that suggest an actual donation/impact.
IMPACT_KEYWORDS = ['donate', 'donation', 'presented', 'bursary', 'cheque',
                   'contribution', 'gave', 'support']

# Words to avoid (ticket sales, meal costs, etc.).
BLACKLIST = ['ticket', 'admission', 'each', 'per person', 'dinner',
             'raffle winner', '1st prize']


def _is_impact_text(text):
    """Return True if lowercased *text* mentions a donation keyword and no
    blacklisted sales term."""
    has_keyword = any(word in text for word in IMPACT_KEYWORDS)
    is_sale = any(word in text for word in BLACKLIST)
    return has_keyword and not is_sale


def extract_impact_entries(raw_text, seen_impacts):
    """Extract donation amounts from one paragraph's text.

    Args:
        raw_text: the paragraph text as returned by ``get_text()``.
        seen_impacts: mutable set of dedup keys; updated in place so the
            same amount+snippet (e.g. a story's brief vs. full text) is
            counted only once across the whole archive.

    Returns:
        A list of ``(match_string, value, clean_text)`` tuples, where
        *value* is the parsed float and *clean_text* is a whitespace-
        normalized snippet capped at 100 characters.
    """
    text = raw_text.strip().lower()
    entries = []
    if '$' not in text or not _is_impact_text(text):
        return entries
    for match in MONEY_PATTERN.findall(raw_text):
        val = float(match.replace('$', '').replace(',', ''))
        # Dedup key: amount plus the first 30 chars of the normalized text,
        # to prevent double-counting the same story's brief vs full text.
        unique_id = f"{val}-{text[:30]}"
        if val > 1.0 and unique_id not in seen_impacts:
            seen_impacts.add(unique_id)
            clean_text = " ".join(raw_text.split())[:100]
            entries.append((match, val, clean_text))
    return entries


def scan_archive(archive_path=ARCHIVE_PATH):
    """Walk *archive_path* and collect donation entries from all HTML files.

    Returns:
        ``(total_donations, entries_found)`` — the summed float total and a
        list of formatted "amount | snippet..." report lines.
    """
    # Third-party dependency imported lazily so the pure-text helpers above
    # stay importable/testable without bs4 installed.
    from bs4 import BeautifulSoup

    total_donations = 0.0
    entries_found = []
    seen_impacts = set()
    for root, _dirs, filenames in os.walk(archive_path):
        for filename in filenames:
            if not filename.endswith(".html"):
                continue
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                soup = BeautifulSoup(f, 'html.parser')
            # Target the main story body; fall back to the whole document.
            content_area = (soup.find('div', class_='StoryContent')
                            or soup.find('div', id='content')
                            or soup)
            for p in content_area.find_all(['p', 'div', 'li']):
                for match, val, clean_text in extract_impact_entries(
                        p.get_text(), seen_impacts):
                    total_donations += val
                    entries_found.append(f"{match} | {clean_text}...")
    return total_donations, entries_found


def print_report(total_donations, entries_found):
    """Print the formatted impact report to stdout."""
    print("\n--- ROTARY COMMUNITY IMPACT REPORT ---")
    if not entries_found:
        print("No matching donations found with the current keywords.")
    else:
        for entry in entries_found:
            print(entry)
    print("--------------------------------------")
    print(f"ESTIMATED TOTAL IMPACT: ${total_donations:,.2f}")


if __name__ == "__main__":
    total, entries = scan_archive()
    print_report(total, entries)