
    ג[i^+                     h    d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlmZ  G d d          ZdS )aZ  
JSON Loader Service

This module loads JSON records from the ODF directory structure.
It respects folder hierarchy and extracts records relevant to selected collections.

Key principles:
- NO normalization - preserve all fields as-is
- NO field removal - keep all keys
- Support multiple JSON files per collection
- Respect ODF folder structure
    N)Path)ListDictAnyOptionalTuple)JSONFlattenerc            
       n   e Zd ZdZdedeeef         fdZdee         fdZ	dede
e         fdZdedee         fd	Zd
ede
eeef                  fdZdeeef         ded
edeeef         fdZdededeeeef                  fdZdededefdZdedeeef         fdZdededeeef         fdZdS )
JSONLoaderz
    Service for loading JSON records from ODF directory structure.
    
    Loads JSON files while preserving all original fields and structure.
    All mappings are read from configuration.
    	base_pathcollections_configc                 b    t          |          | _        || _        t                      | _        dS )z
        Initialize JSON loader with base path and collections config.
        
        Args:
            base_path: Base path to JSON files (e.g., 'idf-json-sample')
            collections_config: Collections configuration from app_config.yaml
        N)r   r   r   r	   	flattener)selfr   r   s      0/var/www/html/IGF-ODF-V3/services/json_loader.py__init__zJSONLoader.__init__#   s(     i"4&    returnc                     | j                                         sg S g }| j                                         D ]0}|                                r|                    |j                   1t          |          S )z
        Get list of available JSON source directories.
        
        Returns:
            List of directory names (e.g., ['tokyo_2020', 'paris_2024'])
        )r   existsiterdiris_dirappendnamesorted)r   sourcesitems      r   get_available_sourcesz JSONLoader.get_available_sources/   sr     ~$$&& 	IN**,, 	* 	*D{{}} *ty)))gr   collection_namec                 b    | j                             |i           }|                    d          S )z
        Get the ODF folder name for a collection from config.
        
        Args:
            collection_name: Name of the collection
            
        Returns:
            Folder name or None if not configured
        json_folderr   get)r   r   collection_configs      r   get_collection_folderz JSONLoader.get_collection_folder@   s1     !377LL $$]333r   c                     g S )a  
        Get expected document types for a collection.
        
        NOTE: With folder-based validation, we no longer track specific document types.
        This method now returns an empty list, indicating validation is folder-based.
        
        Args:
            collection_name: Name of the collection
            
        Returns:
            Empty list (validation is now folder-based, not type-based)
         )r   r   s     r   get_expected_document_typesz&JSONLoader.get_expected_document_typesM   s	     	r   	file_pathc                     	 t          |dd          5 }t          j        |          cddd           S # 1 swxY w Y   dS # t          $ r }t	          d| d|            Y d}~dS d}~ww xY w)z
        Load and parse a single JSON file.
        
        Args:
            file_path: Path to JSON file
            
        Returns:
            Parsed JSON data or None if error
        rzutf-8)encodingNzError loading z: )openjsonload	Exceptionprint)r   r)   fes       r   load_json_filezJSONLoader.load_json_file\   s    	iw777 $1y||$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ 	 	 	39333344444444	s2   A 5A 9A 9A 
A,A''A,	json_datac           	          | j                             |i           }|                    d          }|sdS |j        j        }||k    rd|j         d| d| d| d	}d|fS dS )	ab  
        Validate that JSON file is in the correct folder for the target collection.
        
        NEW APPROACH: Validation based on folder structure, not document_type.
        Files in the correct folder are automatically valid, regardless of document_type.
        This makes the system flexible and independent of specific document type values.
        
        Args:
            json_data: Parsed JSON data
            collection_name: Target collection name
            file_path: Path to JSON file (for validation)
            
        Returns:
            Tuple of (is_valid, error_message)
        r!   )T zVALIDATION ERROR: File 'z' is in folder 'z' but collection 'z' expects files from folder 'z7'. This file should NOT be loaded into this collection.F)r   r#   parentr   )r   r5   r   r)   r$   expected_folderactual_folder	error_msgs           r   validate_document_typez!JSONLoader.validate_document_typem   s    " !377LL+//>> 	8
 "(- O+ 	$H9> H H= H H#2H HQ`H H H 
 )## xr   source_namec                 v   |                      |          }|g S | j        |z  |z  }|                                r|                                sg S g }g }g }|                    d          D ]}|                     |          }	|	r|                     |	||          \  }
}|
sJ|                    |           |                    |j                   t          d|j         d|            | j
                            |	d          }|D ]B}|j        |d<   t          |          |d<   ||d	<   ||d
<   d|d<   |                    |           C|rjt          d| d           t          dt          |           d           t          dt          |           d           |D ]}t          d|            |S )a|  
        Load all JSON records for a collection from a source directory.
        
        This method:
        1. Finds the appropriate ODF folder
        2. Loads all JSON files in that folder
        3. Flattens ALL nested fields recursively
        4. Returns flattened records with ALL fields preserved
        
        CRITICAL: NO fields are lost. ALL nested data is flattened.
        
        Args:
            collection_name: Name of the collection
            source_name: Name of the source directory (e.g., 'tokyo_2020')
            
        Returns:
            List of flattened records with ALL fields from JSON
        N*.jsonu   ⚠️ SKIPPED: z - Wrong document type for T)extract_arrays_source_file_source_path_source_competition_target_collection_validation_passedu   
⚠️ VALIDATION SUMMARY for :z   - Loaded: z recordsz   - Skipped: z files (wrong document type)z   - )r%   r   r   r   globr4   r<   r   r   r1   r   flatten_json_filestrlen)r   r   r=   folder_namefolder_pathall_recordsvalidation_errorsskipped_filesr)   datais_validr;   flattened_recordsrecorderrors                  r   load_collection_from_sourcez&JSONLoader.load_collection_from_source   s9   . 00AA 	I n{2[@!!## 	;+=+=+?+? 	I %))(33 	/ 	/I&&y11D /&*&A&A$Yb&c&c#) %,,Y777!((888iY^iiXgiijjj %)N$D$DTZ^$D$_$_! 0 / /F-6^F>*-0^^F>*4?F013BF/037F/0&&v....  	'G_GGGHHH<#k"2"2<<<===S3}#5#5SSSTTT* ' 'oeoo&&&&r   c                     |                      |          }|dS | j        |z  |z  }|                                r|                                sdS t	          t          |                    d                              S )a  
        Count JSON files for a collection in a source directory.
        
        Args:
            collection_name: Name of the collection
            source_name: Name of the source directory
            
        Returns:
            Number of JSON files found
        Nr   r?   )r%   r   r   r   rJ   listrG   r   r   r=   rK   rL   s        r   count_files_in_sourcez JSONLoader.count_files_in_source   s     00AA 	1n{2[@!!## 	;+=+=+?+? 	14((2233444r   c                     | j                             |i           }|                    d          }|||du|rd| dnddS )z
        Get validation information for a collection.
        
        Args:
            collection_name: Name of the collection
            
        Returns:
            Dictionary with validation rules
        r!   NzOnly files from folder 'z/' will be loadedzNo folder validation)
collectionr9   validation_enableddescriptionr"   )r   r   r$   rK   s       r   get_validation_infozJSONLoader.get_validation_info   si     !377LL'++M:: **"-T"9XcTkTTTTi	
 
 	
r   c                     |                      |          }|ddddS | j        |z  |z  }d|t          |          |                                |                     ||          dS )a  
        Get information about a collection in a source.
        
        Args:
            collection_name: Name of the collection
            source_name: Name of the source directory
            
        Returns:
            Dictionary with collection info
        NFr   )	supportedfolder
file_countT)r`   ra   pathr   rb   )r%   r   rI   r   rY   rX   s        r   get_collection_infozJSONLoader.get_collection_info  s     00AA 	"   n{2[@ !$$!((**44_kRR
 
 	
r   N)__name__
__module____qualname____doc__rI   r   r   r   r   r   r   r%   r(   r   r4   tupleboolr<   rU   intrY   r^   rd   r'   r   r   r   r      s        
)# 
)4S> 
) 
) 
) 
)tCy    "4S 4Xc] 4 4 4 43 49     $sCx.1I    "'S#X 'QT 'ae 'joptvypyjz ' ' ' 'RJJ J 
d38n		J J J JX55 5 
	5 5 5 56
3 
4S> 
 
 
 
( 
 
  
 
c3h	 
  
  
  
  
  
r   r   )rh   r.   ospathlibr   typingr   r   r   r   r   services.json_flattenerr	   r   r'   r   r   <module>rp      s      				       3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1\
 \
 \
 \
 \
 \
 \
 \
 \
 \
r   