# Source code for llvm_ir_dataset_utils.tools.validate_parquet_db
"""This script loads in a folder of parquet files from the process_to_parquet.py
script and validates some of the fields. This is not an exhaustive validation
and only contains simple smoke tests, such as ensuring fields are not empty.
"""
import logging
import os
from absl import app
from absl import flags
from pyarrow import parquet
# Shared absl flag container; main() reads the parsed flag values from it.
FLAGS = flags.FLAGS

flags.DEFINE_string(
    name='dataset_path',
    default=None,
    help='The dataset path to validate',
)
# Columns loaded from every parquet file, each paired with the message that is
# logged when a row's value for that column is empty.
_FIELD_CHECKS = (
    ('license_expression', 'License expression empty'),
    ('license_source', 'License source empty'),
    ('license_files', 'License files empty'),
    ('package_source', 'Package source empty'),
    ('language', 'Language field empty'),
)


def _count_empty_fields(module_instance):
    """Logs every empty checked field of one row and returns how many there were.

    Args:
      module_instance: A mapping-like row (e.g. a pandas Series) indexable by
        the column names listed in _FIELD_CHECKS.

    Returns:
      The number of checked fields in the row that are null or have length 0.
    """
    empty_fields = 0
    for column, message in _FIELD_CHECKS:
        value = module_instance[column]
        # A null cell is expected to surface as None, which has no len(); count
        # it as empty instead of aborting the whole validation run.
        # len() is used rather than truthiness because a list-like cell may be
        # an array whose truth value is ambiguous (presumably license_files --
        # verify against the dataset schema).
        if value is None or len(value) == 0:
            empty_fields += 1
            logging.info(message)
    return empty_fields


def main(_):
    """Smoke-tests every parquet file found directly in FLAGS.dataset_path.

    Each directory entry is read with pyarrow (only the columns named in
    _FIELD_CHECKS), converted to pandas, and scanned row by row. An info
    message is logged for every empty field, followed by a per-file warning
    total and, at the end, the number of rows seen across all files.

    Args:
      _: Positional command-line arguments forwarded by absl; unused.
    """
    total_rows = 0
    for file_name in os.listdir(FLAGS.dataset_path):
        full_file_path = os.path.join(FLAGS.dataset_path, file_name)
        # Load only the columns that are validated below.
        current_table = parquet.read_table(
            full_file_path,
            columns=[column for column, _ in _FIELD_CHECKS]).to_pandas()
        warning_count = 0
        for _, module_instance in current_table.iterrows():
            total_rows += 1
            warning_count += _count_empty_fields(module_instance)
        # Report the tally; previously it was accumulated but never surfaced.
        logging.info('Found %d warnings in %s.', warning_count, full_file_path)
        logging.info('Finished processing individual dataset file.')
    logging.info('Finished processing dataset, found %d rows.', total_rows)
if __name__ == '__main__':
    # dataset_path defaults to None; require it so a missing flag produces a
    # clear usage error instead of None being passed on to os.listdir() and
    # os.path.join() inside main().
    flags.mark_flag_as_required('dataset_path')
    app.run(main)