o
    ג[ig7                     @   sR   d Z ddlmZmZmZmZmZmZ ddlZddl	Z	ddl
mZ G dd dZdS )ad  
Diff Engine Service

This module compares JSON records against MongoDB records to identify:
- Exact matches
- Potential duplicates (by business keys OR alternative keys)
- New records
- Field differences

Key principles:
- Config-driven matching rules
- Support for multiple matching strategies
- No automatic decisions
- Preserve all comparison metadata
    )ListDictAnySetTupleOptionalN)JSONFlattenerc                
   @   s  e Zd ZdZdeeef fddZdeeef defddZdeeef d	e	e de
e fd
dZde	eeef  dee fddZdeeef deeef deeef fddZdede	eeef  de	eeef  deeef fddZdede	e fddZdS )
DiffEnginez
    Service for comparing JSON records with MongoDB records.
    
    Identifies matches, duplicates, and new records using configurable
    business key matching rules with support for alternative keys.
    collections_configc                 C   s   || _ t | _dS )z
        Initialize diff engine with collections configuration.
        
        Args:
            collections_config: Collections configuration from app_config.yaml
        N)r
   r   	flattener)selfr
    r   0/var/www/html/IGF-ODF-V3/services/diff_engine.py__init__   s   zDiffEngine.__init__recordreturnc                 C   s4   dd |  D }tj|dtd}t|  S )z
        Compute hash of a record for exact matching.
        
        Args:
            record: Record dictionary (flattened or nested)
            
        Returns:
            SHA256 hash of record
        c                 S   s    i | ]\}}| d s||qS _
startswith).0kvr   r   r   
<dictcomp>4   s     z,DiffEngine._compute_hash.<locals>.<dictcomp>T)	sort_keysdefault)itemsjsondumpsstrhashlibsha256encode	hexdigest)r   r   clean_record
record_strr   r   r   _compute_hash)   s   zDiffEngine._compute_hashkeysc                 C   sD   g }|D ]}| |}|du r dS |t| q|r t|S dS )z
        Extract business key values from a record.
        
        Args:
            record: Record dictionary
            keys: List of key field names
            
        Returns:
            Tuple of key values or None if any key is missing
        N)getappendr   tuple)r   r   r'   valueskeyvaluer   r   r   _extract_business_key9   s   
z DiffEngine._extract_business_keyrecordsc                 C   s"   t  }|D ]	}||  q|S )z
        Get union of all fields across records.
        
        Args:
            records: List of records
            
        Returns:
            Set of all field names
        )setupdater'   )r   r/   
all_fieldsr   r   r   r   _get_all_fieldsV   s   
zDiffEngine._get_all_fieldsjson_record	db_recordc           
      C   s   |  ||g}i }g }g }|D ]B}||}||}	|dkr q||vr2|| d|dd||< q||vr?dd|	d||< q||	krLd||	d||< q|| q|||t|dS )z
        Compare two records field by field.
        
        Args:
            json_record: Record from JSON
            db_record: Record from MongoDB
            
        Returns:
            Dictionary with comparison results
        _idnewN)status
json_valuedb_valuemissing_in_json	different)differences
new_fieldsmatching_fieldsdifference_count)r3   r(   r)   len)
r   r4   r5   r2   r=   r>   r?   fieldr9   r:   r   r   r   _compare_recordsg   s>   


zDiffEngine._compare_recordscollection_namejson_records
db_recordsc           &      C   s  g }|D ]}t dd | D r|| q| j|}|| q| j|i }|dg }|dg }	|dd}
|sYg g g g ddt|ddddg g |
t|t||d	S i }|D ]}| |}|||< q]i }i }|D ]+}| 	||}|r}|||< |	r|
d
v r| 	||	}|r||vrg ||< || | qog }g }g }t
 }|D ]}| |}||v r|| || qd}d}d}d}|
dv r| 	||}|r||v rd}d}|| }|}|s|
d
v r|	r| 	||	}|r||v r|| }t|dkrd}d}|d }|}n	 |r+|r+| ||}||||||d || q| 	||}|||d q| |}| |} dd |D }!dd | D }"|!|" }#tdd |D }$tdd |D }%|||t|#t|t|t|t|#|$|%d||	|
t|t||d	S )a  
        Compare JSON records against MongoDB records with multiple matching strategies.
        
        This method:
        1. Flattens MongoDB records if needed
        2. Identifies exact matches (by hash)
        3. Identifies potential duplicates (by primary OR alternative keys)
        4. Identifies new records
        5. Identifies new fields (ALL flattened keys)
        
        Matching Strategies:
        - primary_only: Match only by business_keys
        - primary_or_alternative: Try business_keys first, then alternative_keys
        - alternative_only: Match only by alternative_keys
        
        Args:
            collection_name: Name of the collection
            json_records: Flattened records from JSON files
            db_records: Records from MongoDB (will be flattened)
            
        Returns:
            Dictionary with comparison results including match_method for each duplicate
        c                 s   s    | ]}d |v V  qdS ).Nr   )r   r,   r   r   r   	<genexpr>   s    z1DiffEngine.compare_collections.<locals>.<genexpr>business_keysalternative_keysmatching_strategyprimary_onlyr   )exact_match_countduplicate_countnew_record_countnew_field_countprimary_key_matchesalternative_key_matches)exact_matches
duplicatesnew_recordsr>   summarymatching_keysrJ   rK   
json_countdb_countrD   )primary_or_alternativealternative_onlyFN)rL   rZ   Tprimary_key   alternative_key)r4   r5   
comparisonmatching_keymatch_method)r4   business_keyc                 S      h | ]	}| d s|qS r   r   r   fr   r   r   	<setcomp>N      z1DiffEngine.compare_collections.<locals>.<setcomp>c                 S   rc   r   r   rd   r   r   r   rf   O  rg   c                 s   "    | ]}| d dkrdV  qdS )ra   r\   r]   Nr(   r   dr   r   r   rH   S       c                 s   rh   )ra   r^   r]   Nri   rj   r   r   r   rH   T  rl   )anyr'   r)   r   flattenr
   r(   rA   r&   r.   r0   addrC   r3   sumlist)&r   rD   rE   rF   flattened_db_recordsr5   flatcollection_configrI   rJ   rK   
db_by_hashrecord_hashdb_by_primary_keydb_by_alternative_keyr\   alt_keyrS   rT   rU   matched_json_hashesr4   	json_hashmatchedra   matched_db_recordmatched_keyjson_primary_keyjson_alt_keyalt_matchesr_   json_fields	db_fieldsjson_data_fieldsdb_data_fieldsall_new_fieldsprimary_matchesalternative_matchesr   r   r   compare_collections   s   







zDiffEngine.compare_collectionsc                 C   s   | j |i }|dg S )z
        Get the business key matching rules for a collection from config.
        
        Args:
            collection_name: Name of the collection
            
        Returns:
            List of field names used for matching
        rI   )r
   r(   )r   rD   rt   r   r   r   get_matching_keysk  s   
zDiffEngine.get_matching_keysN)__name__
__module____qualname____doc__r   r   r   r   r&   r   r   r   r.   r   r3   rC   r   r   r   r   r   r   r	      s<    


"



>

 Gr	   )r   typingr   r   r   r   r   r   r    r   services.json_flattenerr   r	   r   r   r   r   <module>   s     