"""
Diff Engine Service

This module compares JSON records against MongoDB records to identify:
- Exact matches
- Potential duplicates (by business keys OR alternative keys)
- New records
- Field differences

Key principles:
- Config-driven matching rules
- Support for multiple matching strategies
- No automatic decisions
- Preserve all comparison metadata
"""

from typing import List, Dict, Any, Set, Tuple, Optional
import hashlib
import json
from services.json_flattener import JSONFlattener


class DiffEngine:
    """
    Service for comparing JSON records with MongoDB records.

    Identifies matches, duplicates, and new records using configurable
    business key matching rules with support for alternative keys.
    """

    # Field name prefixes excluded from hashing and field-level comparison.
    # These fields are added during import/consolidation and must not cause
    # otherwise-identical records to be reported as different.
    _EXCLUDED_PREFIXES = (
        'codes',           # Added during people consolidation (codes, codes.0, codes.1, etc.)
        'created_at',      # Added during import
        'updated_at',      # Added during update
    )

    def __init__(self, collections_config: Dict[str, Any]):
        """
        Initialize diff engine with collections configuration.

        Args:
            collections_config: Collections configuration from app_config.yaml
        """
        self.collections_config = collections_config
        self.flattener = JSONFlattener()

    @classmethod
    def _is_excluded_field(cls, key: str) -> bool:
        """
        Check whether a field should be ignored during hashing/comparison.

        Excludes metadata fields (leading underscore, which also covers
        '_id') and import-added fields (see _EXCLUDED_PREFIXES, matched
        either exactly or as a dotted prefix, e.g. 'codes.0').

        Args:
            key: Flattened field name

        Returns:
            True if the field must be skipped
        """
        if key.startswith('_'):
            return True
        return any(
            key == prefix or key.startswith(prefix + '.')
            for prefix in cls._EXCLUDED_PREFIXES
        )

    def _normalize_value(self, value: Any) -> Any:
        """
        Normalize a value for field-by-field comparison.

        Normalization rules:
        - None stays None
        - booleans are kept as-is (checked before numerics since bool
          is a subclass of int)
        - the strings 'True'/'False' become real booleans
        - int/float collapse to float so 1 == 1.0

        Args:
            value: Value to normalize

        Returns:
            Normalized value
        """
        if value is None:
            return None

        if isinstance(value, bool):
            return value

        if isinstance(value, str) and value in ('True', 'False'):
            return value == 'True'

        if isinstance(value, (int, float)):
            return float(value)

        return value

    def _compute_hash(self, record: Dict[str, Any]) -> str:
        """
        Compute a SHA256 hash of a record for exact-match detection.

        Metadata and import-added fields are stripped first (see
        _is_excluded_field), then values are normalized so equivalent
        records hash identically:
        - int/float collapse to float: MongoDB stores numbers as float
          while JSON may carry int, so 1 and 1.0 must produce the same
          hash (str(1) != str(1.0) would not)
        - the strings 'True'/'False' collapse to real booleans
        - numeric strings are deliberately kept as strings; only actual
          numeric types are normalized

        Args:
            record: Record dictionary (flattened or nested)

        Returns:
            SHA256 hex digest of the normalized record
        """
        clean_record = {
            k: v for k, v in record.items()
            if not self._is_excluded_field(k)
        }

        normalized_record = {}
        for key, value in clean_record.items():
            if isinstance(value, bool):
                # bool is a subclass of int -- must be checked first
                normalized_record[key] = value
            elif isinstance(value, (int, float)):
                # Collapse int/float so 1 and 1.0 hash identically
                normalized_record[key] = float(value)
            elif isinstance(value, str) and value in ('True', 'False'):
                # String booleans collapse to real booleans
                normalized_record[key] = value == 'True'
            else:
                normalized_record[key] = value

        # sort_keys gives a canonical serialization; default=str covers
        # non-JSON-serializable values (ObjectId, datetime, ...)
        record_str = json.dumps(normalized_record, sort_keys=True, default=str)
        return hashlib.sha256(record_str.encode()).hexdigest()

    def _extract_business_key(
        self,
        record: Dict[str, Any],
        keys: List[str]
    ) -> Optional[Tuple]:
        """
        Extract business key values from a record.

        Args:
            record: Record dictionary
            keys: List of key field names

        Returns:
            Tuple of stringified key values, or None if `keys` is empty
            or any key field is missing/None in the record
        """
        values = []

        for key in keys:
            value = record.get(key)

            # A record missing any part of the key cannot be matched
            if value is None:
                return None

            # Stringify so e.g. 5 and '5' compare equal as keys
            values.append(str(value))

        return tuple(values) if values else None

    def _get_all_fields(self, records: List[Dict[str, Any]]) -> Set[str]:
        """
        Get the union of all field names across records.

        Args:
            records: List of records

        Returns:
            Set of all field names
        """
        all_fields: Set[str] = set()

        for record in records:
            all_fields.update(record.keys())

        return all_fields

    def _compare_records(
        self,
        json_record: Dict[str, Any],
        db_record: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Compare two records field by field.

        Metadata and import-added fields are skipped (see
        _is_excluded_field). Each remaining field is classified as
        'new' (JSON only), 'missing_in_json' (DB only), 'different'
        (both present, normalized values differ) or matching.

        Args:
            json_record: Record from JSON
            db_record: Record from MongoDB

        Returns:
            Dict with 'differences' (field -> status/json_value/db_value),
            'new_fields', 'matching_fields' and 'difference_count'
        """
        differences = {}
        new_fields = []
        matching_fields = []

        for field in self._get_all_fields([json_record, db_record]):
            if self._is_excluded_field(field):
                continue

            json_value = json_record.get(field)
            db_value = db_record.get(field)

            if field not in db_record:
                # New field in JSON
                new_fields.append(field)
                differences[field] = {
                    'status': 'new',
                    'json_value': json_value,
                    'db_value': None
                }
            elif field not in json_record:
                # Field exists in DB but not in JSON
                differences[field] = {
                    'status': 'missing_in_json',
                    'json_value': None,
                    'db_value': db_value
                }
            elif self._normalize_value(json_value) != self._normalize_value(db_value):
                # Both present with different (normalized) values
                differences[field] = {
                    'status': 'different',
                    'json_value': json_value,
                    'db_value': db_value
                }
            else:
                matching_fields.append(field)

        return {
            'differences': differences,
            'new_fields': new_fields,
            'matching_fields': matching_fields,
            'difference_count': len(differences)
        }

    def detect_json_duplicates(
        self,
        collection_name: str,
        json_records: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Detect duplicates within JSON data itself (before comparing with DB).

        For the 'people' collection a case-insensitive identity of
        given_name + family_name + birth_date is used; other collections
        group by their configured business keys. Records lacking the
        required identity/key fields are skipped and not counted.

        Args:
            collection_name: Name of the collection
            json_records: Flattened records from JSON files

        Returns:
            Dict with 'has_duplicates', 'duplicate_count' (number of
            duplicate groups), 'total_duplicate_records',
            'duplicate_groups' (key -> list of {'index', 'record'})
            and 'unique_count' (groups with a single member)
        """
        from collections import defaultdict

        config = self.collections_config.get(collection_name, {})
        business_keys = config.get('business_keys', [])

        # Special handling for people collection: match on normalized
        # name + birth date rather than configured business keys
        if collection_name == 'people':
            identity_groups = defaultdict(list)

            for idx, record in enumerate(json_records):
                given_name = str(record.get('given_name', '')).strip().lower()
                family_name = str(record.get('family_name', '')).strip().lower()
                birth_date = str(record.get('birth_date', '')).strip()

                # Both name parts are required; birth_date may be empty
                if given_name and family_name:
                    identity_key = (given_name, family_name, birth_date)
                    identity_groups[identity_key].append({
                        'index': idx,
                        'record': record
                    })

            duplicates = {k: v for k, v in identity_groups.items() if len(v) > 1}

            return {
                'has_duplicates': len(duplicates) > 0,
                'duplicate_count': len(duplicates),
                'total_duplicate_records': sum(len(v) for v in duplicates.values()),
                'duplicate_groups': duplicates,
                'unique_count': len(identity_groups) - len(duplicates)
            }

        # Without business keys there is nothing to group on
        if not business_keys:
            return {
                'has_duplicates': False,
                'duplicate_count': 0,
                'total_duplicate_records': 0,
                'duplicate_groups': {},
                'unique_count': len(json_records)
            }

        key_groups = defaultdict(list)

        for idx, record in enumerate(json_records):
            business_key = self._extract_business_key(record, business_keys)
            if business_key:
                key_groups[business_key].append({
                    'index': idx,
                    'record': record
                })

        duplicates = {k: v for k, v in key_groups.items() if len(v) > 1}

        return {
            'has_duplicates': len(duplicates) > 0,
            'duplicate_count': len(duplicates),
            'total_duplicate_records': sum(len(v) for v in duplicates.values()),
            'duplicate_groups': duplicates,
            'unique_count': len(key_groups) - len(duplicates)
        }

    def compare_collections(
        self,
        collection_name: str,
        json_records: List[Dict[str, Any]],
        db_records: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Compare JSON records against MongoDB records with multiple matching strategies.

        This method:
        1. Detects duplicates within JSON data first
        2. Flattens MongoDB records if needed
        3. Identifies exact matches (by hash)
        4. Identifies potential duplicates (by primary OR alternative keys)
        5. Identifies new records
        6. Identifies new fields (ALL flattened keys)

        Matching Strategies:
        - primary_only: Match only by business_keys
        - primary_or_alternative: Try business_keys first, then alternative_keys
        - alternative_only: Match only by alternative_keys

        Args:
            collection_name: Name of the collection
            json_records: Flattened records from JSON files
            db_records: Records from MongoDB (will be flattened)

        Returns:
            Dictionary with comparison results including 'json_duplicates'
            and a match_method for each duplicate. When the collection has
            no business keys configured, no per-record matching is possible:
            'new_records' stays empty but 'new_record_count' reflects all
            JSON records.
        """
        # Duplicates inside the JSON payload itself (reported, never resolved)
        json_duplicates = self.detect_json_duplicates(collection_name, json_records)

        # Flatten MongoDB records unless they already use dot-notation keys
        flattened_db_records = []
        for db_record in db_records:
            if any('.' in key for key in db_record.keys()):
                flattened_db_records.append(db_record)
            else:
                flattened_db_records.append(self.flattener.flatten(db_record))

        collection_config = self.collections_config.get(collection_name, {})
        business_keys = collection_config.get('business_keys', [])
        alternative_keys = collection_config.get('alternative_keys', [])
        matching_strategy = collection_config.get('matching_strategy', 'primary_only')

        if not business_keys:
            # No business keys defined - can't detect duplicates against
            # the DB. Still report intra-JSON duplicates so the result
            # shape matches the main return below.
            return {
                'exact_matches': [],
                'duplicates': [],
                'new_records': [],
                'new_fields': [],
                'json_duplicates': json_duplicates,
                'summary': {
                    'exact_match_count': 0,
                    'duplicate_count': 0,
                    'new_record_count': len(json_records),
                    'new_field_count': 0,
                    'primary_key_matches': 0,
                    'alternative_key_matches': 0,
                    'json_duplicate_count': json_duplicates.get('duplicate_count', 0),
                    'json_duplicate_records': json_duplicates.get('total_duplicate_records', 0)
                },
                'matching_keys': [],
                'alternative_keys': [],
                'matching_strategy': matching_strategy,
                'json_count': len(json_records),
                'db_count': len(db_records),
                'collection_name': collection_name
            }

        # Index DB records by content hash for exact matching
        db_by_hash = {
            self._compute_hash(record): record
            for record in flattened_db_records
        }

        # Index DB records by primary and (optionally) alternative keys.
        # NOTE: if several DB records share a primary key, the last one
        # wins -- primary keys are assumed unique in the DB.
        use_alternative = bool(alternative_keys) and matching_strategy in (
            'primary_or_alternative', 'alternative_only'
        )
        db_by_primary_key = {}
        db_by_alternative_key = {}

        for db_record in flattened_db_records:
            primary_key = self._extract_business_key(db_record, business_keys)
            if primary_key:
                db_by_primary_key[primary_key] = db_record

            if use_alternative:
                alt_key = self._extract_business_key(db_record, alternative_keys)
                if alt_key:
                    # Alternative keys may legitimately collide; keep all
                    db_by_alternative_key.setdefault(alt_key, []).append(db_record)

        exact_matches = []
        duplicates = []
        new_records = []

        for json_record in json_records:
            json_hash = self._compute_hash(json_record)

            # Exact content match takes precedence over key matching
            if json_hash in db_by_hash:
                exact_matches.append(json_record)
                continue

            matched_db_record = None
            match_method = None
            matched_key = None

            # Strategy 1: primary business keys
            if matching_strategy in ('primary_only', 'primary_or_alternative'):
                json_primary_key = self._extract_business_key(json_record, business_keys)
                if json_primary_key and json_primary_key in db_by_primary_key:
                    matched_db_record = db_by_primary_key[json_primary_key]
                    match_method = 'primary_key'
                    matched_key = json_primary_key

            # Strategy 2: alternative keys (only when primary didn't match)
            if matched_db_record is None and use_alternative:
                json_alt_key = self._extract_business_key(json_record, alternative_keys)
                if json_alt_key:
                    alt_matches = db_by_alternative_key.get(json_alt_key, [])
                    if len(alt_matches) == 1:
                        # Single unambiguous match - safe to use
                        matched_db_record = alt_matches[0]
                        match_method = 'alternative_key'
                        matched_key = json_alt_key
                    # Multiple alternative-key matches are ambiguous:
                    # fall through and treat the record as new for safety

            if matched_db_record is not None:
                # Potential duplicate - attach a field-level comparison
                duplicates.append({
                    'json_record': json_record,
                    'db_record': matched_db_record,
                    'comparison': self._compare_records(json_record, matched_db_record),
                    'matching_key': matched_key,
                    'match_method': match_method  # 'primary_key' or 'alternative_key'
                })
            else:
                # New record - no match found
                new_records.append({
                    'json_record': json_record,
                    'business_key': self._extract_business_key(json_record, business_keys)
                })

        # New fields: data fields (no leading underscore) present in the
        # JSON payload but absent from every DB record
        json_fields = self._get_all_fields(json_records)
        db_fields = self._get_all_fields(flattened_db_records)
        json_data_fields = {f for f in json_fields if not f.startswith('_')}
        db_data_fields = {f for f in db_fields if not f.startswith('_')}
        all_new_fields = json_data_fields - db_data_fields

        primary_matches = sum(
            1 for d in duplicates if d.get('match_method') == 'primary_key'
        )
        alternative_matches = sum(
            1 for d in duplicates if d.get('match_method') == 'alternative_key'
        )

        return {
            'exact_matches': exact_matches,
            'duplicates': duplicates,
            'new_records': new_records,
            'new_fields': list(all_new_fields),
            'json_duplicates': json_duplicates,  # Duplicates within JSON data
            'summary': {
                'exact_match_count': len(exact_matches),
                'duplicate_count': len(duplicates),
                'new_record_count': len(new_records),
                'new_field_count': len(all_new_fields),
                'primary_key_matches': primary_matches,
                'alternative_key_matches': alternative_matches,
                'json_duplicate_count': json_duplicates.get('duplicate_count', 0),
                'json_duplicate_records': json_duplicates.get('total_duplicate_records', 0)
            },
            'matching_keys': business_keys,
            'alternative_keys': alternative_keys,
            'matching_strategy': matching_strategy,
            'json_count': len(json_records),
            'db_count': len(db_records),
            'collection_name': collection_name
        }

    def get_matching_keys(self, collection_name: str) -> List[str]:
        """
        Get the business key matching rules for a collection from config.

        Args:
            collection_name: Name of the collection

        Returns:
            List of field names used for matching (empty if unconfigured)
        """
        collection_config = self.collections_config.get(collection_name, {})
        return collection_config.get('business_keys', [])
