"""Find which JSON source matches MongoDB data."""
from services.mongo import MongoService
from services.json_loader import JSONLoader
from services.settings_service import SettingsService
from services.json_flattener import JSONFlattener

# Configuration object exposed by SettingsService; read again further down
# when the JSON loader and the diff engine are built.
config = SettingsService().config

# Connect to MongoDB and keep a handle on the 'people' collection as `c`.
mongo = MongoService()
mongo.connect()
c = mongo.get_collection('people')

# Collect every code stored in MongoDB, as strings.
# Only documents that have a `code` field are fetched; each one is flattened
# first and records whose flattened `code` is falsy are skipped.
flattener = JSONFlattener()
db_codes = set()
for document in c.find({'code': {'$exists': True}}):
    code = flattener.flatten(document).get('code')
    if not code:
        continue
    db_codes.add(str(code))

# Quick sanity output: how many distinct codes, plus a handful of examples.
sample_codes = list(db_codes)[:5]
print(f"MongoDB codes: {len(db_codes)}")
print(f"Sample: {sample_codes}")

# Compare the DB codes against the 'people' records of every JSON source.
loader = JSONLoader(config['json_sources']['base_path'], config['collections'])
sources = ['ba_glf_2018', 'nanjing_2014', 'paris_2024', 'rio_2016', 'tokyo_2020']

for source_name in sources:
    # Any failure while loading or reporting a source is printed and the
    # loop moves on to the next source.
    try:
        records = loader.load_collection_from_source('people', source_name)
        # Stringify codes so they compare against db_codes; falsy codes are dropped.
        raw_codes = (record.get('code') for record in records)
        json_codes = {str(code) for code in raw_codes if code}
        shared = db_codes.intersection(json_codes)

        print(f"\n{source_name}:")
        print(f"  JSON codes: {len(json_codes)}")
        print(f"  Overlap with DB: {len(shared)}")
        if shared:
            print(f"  Sample overlap: {list(shared)[:3]}")
    except Exception as err:
        print(f"\n{source_name}: Error - {err}")

# Run the diff engine against ba_glf_2018 (the source that matches)
print("\n" + "=" * 60)
print("Testing comparison with ba_glf_2018...")
from services.diff_engine import DiffEngine

diff = DiffEngine(config['collections'])
json_records = loader.load_collection_from_source('people', 'ba_glf_2018')
# Materialise the cursor so the engine receives a plain list of documents.
db_records = list(c.find({'code': {'$exists': True}}))

results = diff.compare_collections('people', json_records, db_records)
print("\nResults:")
# Summary counters reported by compare_collections, in display order.
summary = results['summary']
for label, field in (
    ("Exact matches", 'exact_match_count'),
    ("Duplicates", 'duplicate_count'),
    ("New records", 'new_record_count'),
):
    print(f"  {label}: {summary[field]}")

# Analyze duplicates - find why they're not exact matches.

# Top-level keys (and their dotted sub-keys) left out of the difference report.
# NOTE(review): presumably mirrors the fields DiffEngine ignores when hashing
# - confirm against DiffEngine._compute_hash.
excluded_prefixes = ('codes', 'created_at', 'updated_at')


def should_exclude(key):
    """Return True if *key* must be left out of the difference report.

    A key is excluded when it starts with an underscore, equals one of
    ``excluded_prefixes``, or is a dotted sub-key of one (``codes.x``).
    """
    if key.startswith('_'):
        return True
    return any(
        key == prefix or key.startswith(prefix + '.')
        for prefix in excluded_prefixes
    )


if results['duplicates']:
    print(f"\nAnalyzing why {len(results['duplicates'])} records are duplicates, not exact matches...")

    # Only the first few duplicates are inspected to keep the output short.
    for position, dup in enumerate(results['duplicates'][:3], start=1):
        jr = dup.get('json_record', {})
        dr = dup.get('db_record', {})

        print(f"\n  Duplicate {position} (code: {jr.get('code')}):")

        # Hash both sides with the engine's own hashing routine.
        json_hash = diff._compute_hash(jr)
        db_hash = diff._compute_hash(dr)

        if json_hash == db_hash:
            print("    ⚠️ Hashes MATCH but still marked as duplicate!")
            continue

        print("    Hashes differ")

        # Compare the raw top-level fields of both records, minus the
        # excluded keys. No normalization is applied here, so a raw value
        # difference may not be the one that changed the hash.
        jr_clean = {k: v for k, v in jr.items() if not should_exclude(k)}
        dr_clean = {k: v for k, v in dr.items() if not should_exclude(k)}

        # Sorted so the reported sample is the same on every run.
        only_json = sorted(jr_clean.keys() - dr_clean.keys())
        only_db = sorted(dr_clean.keys() - jr_clean.keys())
        # Keys present on both sides whose values are not equal; without
        # this, records with identical key sets would get no explanation.
        differing = sorted(
            k for k in jr_clean.keys() & dr_clean.keys()
            if jr_clean[k] != dr_clean[k]
        )

        if only_json:
            print(f"    Only in JSON: {only_json[:3]}")
        if only_db:
            print(f"    Only in DB: {only_db[:3]}")
        if differing:
            print(f"    Different values: {differing[:3]}")
            for k in differing[:3]:
                print(f"      {k}: JSON={jr_clean[k]!r} | DB={dr_clean[k]!r}")
        if not (only_json or only_db or differing):
            print("    No difference found in the compared top-level fields")
