"""Scan a Rotary story archive for donation amounts and print an impact report.

Walks ARCHIVE_PATH for ``.html`` files, extracts dollar amounts from story
paragraphs whose text suggests an actual donation (and not ticket/meal
pricing), dedups near-identical mentions, and prints an estimated total.
"""

import os
import re

ARCHIVE_PATH = '/srv/files/rotary_archive/1131/Stories'

# Matches "$5,000.00", "$5,000", and plain "$5000"/"$5000.00".
# The (?!\d) lookahead prevents partial matches: without it, "$5000" would
# incorrectly match as "$500" and skew the total.
MONEY_PATTERN = re.compile(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?(?!\d)')

# Keywords that suggest an actual donation/impact.
IMPACT_KEYWORDS = ['donate', 'donation', 'presented', 'bursary', 'cheque',
                   'contribution', 'gave', 'support']

# Words to avoid (ticket sales, meal costs, etc.).
BLACKLIST = ['ticket', 'admission', 'each', 'per person', 'dinner',
             'raffle winner', '1st prize']


def _is_impact_text(text):
    """Return True if lowercased *text* mentions a donation keyword and no
    blacklisted sales term."""
    has_keyword = any(word in text for word in IMPACT_KEYWORDS)
    is_sale = any(word in text for word in BLACKLIST)
    return has_keyword and not is_sale


def extract_impact_entries(raw_text, seen_impacts):
    """Extract donation amounts from one paragraph's text.

    Args:
        raw_text: the paragraph text as returned by ``get_text()``.
        seen_impacts: mutable set of dedup keys; updated in place so the
            same amount+snippet (e.g. a story's brief vs. full text) is
            counted only once across the whole archive.

    Returns:
        A list of ``(match_string, value, clean_text)`` tuples, where
        *value* is the parsed float and *clean_text* is a whitespace-
        normalized snippet capped at 100 characters.
    """
    text = raw_text.strip().lower()
    entries = []
    if '$' not in text or not _is_impact_text(text):
        return entries
    for match in MONEY_PATTERN.findall(raw_text):
        val = float(match.replace('$', '').replace(',', ''))
        # Dedup key: amount plus the first 30 chars of the normalized text,
        # to prevent double-counting the same story's brief vs full text.
        unique_id = f"{val}-{text[:30]}"
        if val > 1.0 and unique_id not in seen_impacts:
            seen_impacts.add(unique_id)
            clean_text = " ".join(raw_text.split())[:100]
            entries.append((match, val, clean_text))
    return entries


def scan_archive(archive_path=ARCHIVE_PATH):
    """Walk *archive_path* and collect donation entries from all HTML files.

    Returns:
        ``(total_donations, entries_found)`` — the summed float total and a
        list of formatted "amount | snippet..." report lines.
    """
    # Third-party dependency imported lazily so the pure-text helpers above
    # stay importable/testable without bs4 installed.
    from bs4 import BeautifulSoup

    total_donations = 0.0
    entries_found = []
    seen_impacts = set()
    for root, _dirs, filenames in os.walk(archive_path):
        for filename in filenames:
            if not filename.endswith(".html"):
                continue
            file_path = os.path.join(root, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                soup = BeautifulSoup(f, 'html.parser')
            # Target the main story body; fall back to the whole document.
            content_area = (soup.find('div', class_='StoryContent')
                            or soup.find('div', id='content')
                            or soup)
            for p in content_area.find_all(['p', 'div', 'li']):
                for match, val, clean_text in extract_impact_entries(
                        p.get_text(), seen_impacts):
                    total_donations += val
                    entries_found.append(f"{match} | {clean_text}...")
    return total_donations, entries_found


def print_report(total_donations, entries_found):
    """Print the formatted impact report to stdout."""
    print("\n--- ROTARY COMMUNITY IMPACT REPORT ---")
    if not entries_found:
        print("No matching donations found with the current keywords.")
    else:
        for entry in entries_found:
            print(entry)
    print("--------------------------------------")
    print(f"ESTIMATED TOTAL IMPACT: ${total_donations:,.2f}")


if __name__ == "__main__":
    total, entries = scan_archive()
    print_report(total, entries)