"""
Debug People Collection Issue

Check why people records show as duplicates with many field differences.
"""

from services.mongo import MongoService
from services.json_loader import JSONLoader
from services.diff_engine import DiffEngine
from services.settings_service import SettingsService
from services.json_flattener import JSONFlattener
import hashlib
import json

def _same_text(left, right):
    """Return True when both values are non-empty strings equal ignoring case."""
    # Flattened DB values are not guaranteed to be strings; calling .lower()
    # on a number or nested value would raise AttributeError.
    if not (isinstance(left, str) and isinstance(right, str)):
        return False
    if not left or not right:
        return False
    return left.lower() == right.lower()


def _report_unmatched_record(results, db_records, flattener, scan_limit=100):
    """Print why the first unmatched JSON record found no DB counterpart.

    Args:
        results: Comparison result mapping; reads 'new_records' (non-empty)
            and 'matching_keys'.
        db_records: Raw DB documents to search for a same-name candidate.
        flattener: Object exposing ``flatten(doc)`` returning a dict.
        scan_limit: Maximum number of DB documents to inspect.
    """
    print("\n6. Analyzing why records don't match...")

    # Entries may be wrapped as {'json_record': {...}}; unwrap if so.
    json_rec = results['new_records'][0]
    if isinstance(json_rec, dict) and 'json_record' in json_rec:
        json_rec = json_rec['json_record']

    print("\n   Sample JSON record business keys:")
    for key in results['matching_keys']:
        val = json_rec.get(key)
        print(f"     {key}: '{val}' (type: {type(val).__name__})")

    given_name = json_rec.get('given_name')
    family_name = json_rec.get('family_name')
    birth_date = json_rec.get('birth_date')

    print("\n   Looking for DB record with:")
    print(f"     given_name: '{given_name}'")
    print(f"     family_name: '{family_name}'")
    print(f"     birth_date: '{birth_date}'")

    scanned = db_records[:scan_limit]
    for db_doc in scanned:
        flat_db = flattener.flatten(db_doc)
        db_given = flat_db.get('given_name')
        db_family = flat_db.get('family_name')
        db_birth = flat_db.get('birth_date')

        # Candidate = same given and family name, ignoring case.
        if not (_same_text(db_given, given_name)
                and _same_text(db_family, family_name)):
            continue

        print("\n   ✅ Found potential match in DB:")
        print(f"     given_name: '{db_given}' (type: {type(db_given).__name__})")
        print(f"     family_name: '{db_family}' (type: {type(db_family).__name__})")
        print(f"     birth_date: '{db_birth}' (type: {type(db_birth).__name__})")

        print("\n   Exact comparison:")
        print(f"     given_name match: {db_given == given_name}")
        print(f"     family_name match: {db_family == family_name}")
        print(f"     birth_date match: {db_birth == birth_date}")

        if db_birth != birth_date:
            # repr() exposes type/whitespace differences hidden by str().
            print("\n   ⚠️ Birth date mismatch!")
            print(f"     JSON: '{birth_date}' (repr: {repr(birth_date)})")
            print(f"     DB:   '{db_birth}' (repr: {repr(db_birth)})")
        break
    else:
        # Make "nothing found" distinguishable from "search skipped".
        print(f"\n   ⚠️ No DB record with the same name in the first "
              f"{len(scanned)} DB records")


def _report_first_duplicate(dup):
    """Print key-set and value differences for one duplicate pair.

    Args:
        dup: Mapping with optional 'json_record' and 'db_record' dicts.
    """
    print("\n7. Analyzing first duplicate...")

    # `or {}` also covers a key that is present but set to None.
    json_rec = dup.get('json_record') or {}
    db_rec = dup.get('db_record') or {}

    # Keys starting with '_' are excluded from the comparison.
    json_keys = {k for k in json_rec if not k.startswith('_')}
    db_keys = {k for k in db_rec if not k.startswith('_')}

    # Sort BEFORE slicing: slicing a set first would pick an arbitrary,
    # run-dependent sample of keys.
    print(f"\n   JSON record keys ({len(json_rec)}):")
    print(f"   {sorted(json_keys)[:15]}...")

    print(f"\n   DB record keys ({len(db_rec)}):")
    print(f"   {sorted(db_keys)[:15]}...")

    json_only = json_keys - db_keys
    print(f"\n   Fields ONLY in JSON ({len(json_only)}):")
    for k in sorted(json_only)[:10]:
        print(f"     - {k}: {json_rec.get(k)}")

    db_only = db_keys - json_keys
    print(f"\n   Fields ONLY in DB ({len(db_only)}):")
    for k in sorted(db_only)[:10]:
        print(f"     - {k}: {db_rec.get(k)}")

    # Sorted iteration keeps the reported sample stable between runs.
    different = [
        (k, json_rec.get(k), db_rec.get(k))
        for k in sorted(json_keys & db_keys)
        if json_rec.get(k) != db_rec.get(k)
    ]

    print(f"\n   Fields with DIFFERENT values ({len(different)}):")
    for k, jv, dv in different[:10]:
        print(f"     - {k}:")
        print(f"         JSON: {jv} ({type(jv).__name__})")
        print(f"         DB:   {dv} ({type(dv).__name__})")

    print("\n8. Checking 'codes' field specifically:")
    print(f"   JSON has 'codes': {'codes' in json_rec}")
    print(f"   DB has 'codes': {'codes' in db_rec}")
    if 'codes' in db_rec:
        print(f"   DB 'codes' value: {db_rec.get('codes')}")

    print("\n9. Checking 'code' field:")
    print(f"   JSON 'code': {json_rec.get('code')}")
    print(f"   DB 'code': {db_rec.get('code')}")


def debug_people(*, source_name='tokyo_2020', db_limit=1000):
    """Debug why people records show field differences.

    Loads the 'people' collection from one JSON source and from MongoDB,
    runs the diff engine on both, prints the summary counts, and then prints
    a detailed analysis of the first unmatched record and the first
    duplicate (when present). Returns None; all output goes to stdout.

    Args:
        source_name: JSON source to load the collection from.
        db_limit: Maximum number of MongoDB documents to load and compare.
    """
    banner = "=" * 80
    print(banner)
    print("Debug People Collection Issue")
    print(banner)

    # Initialize services
    settings_service = SettingsService()
    config = settings_service.config

    # NOTE(review): the connection is never explicitly closed here; if
    # MongoService exposes a close/disconnect method it should be called
    # once the records are loaded -- confirm against MongoService.
    mongo_service = MongoService()
    if not mongo_service.connect():
        print("❌ Failed to connect")
        return

    loader = JSONLoader(
        config['json_sources']['base_path'],
        config['collections']
    )

    diff_engine = DiffEngine(config['collections'])
    flattener = JSONFlattener()

    collection_name = 'people'

    print(f"\n1. Loading JSON from {source_name}...")
    json_records = loader.load_collection_from_source(collection_name, source_name)
    print(f"   Loaded {len(json_records)} JSON records")

    print("\n2. Loading MongoDB data...")
    collection = mongo_service.get_collection(collection_name)
    # NOTE(review): only the first `db_limit` documents are compared, so a
    # JSON record whose counterpart lies outside that window may be
    # reported as new -- confirm whether a full scan is needed.
    db_records = list(collection.find().limit(db_limit))
    print(f"   Loaded {len(db_records)} MongoDB records")

    print("\n3. Running comparison...")
    results = diff_engine.compare_collections(
        collection_name,
        json_records,
        db_records
    )

    summary = results['summary']
    print("\n4. Results:")
    print(f"   Exact matches: {summary['exact_match_count']}")
    print(f"   Duplicates: {summary['duplicate_count']}")
    print(f"   New records: {summary['new_record_count']}")

    print(f"\n5. Business keys used: {results['matching_keys']}")

    if results['new_records']:
        _report_unmatched_record(results, db_records, flattener)

    if results['duplicates']:
        _report_first_duplicate(results['duplicates'][0])

    print("\n" + banner)

if __name__ == "__main__":
    import sys
    import traceback

    try:
        debug_people()
    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback, then exit non-zero so shells and CI see the failure
        # (previously the script exited with status 0 after an error).
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        sys.exit(1)
