# Source code for llvm_ir_dataset_utils.tools.aggregate_build_sizes

"""Tool for aggregating and providing statistics on bitcode size."""

import os
import logging

from absl import flags
from absl import app

import ray

from llvm_ir_dataset_utils.util import dataset_corpus

FLAGS = flags.FLAGS

# Required: main() lists this directory and treats every entry as one build
# corpus.
flags.DEFINE_string(
    name='corpus_dir',
    default=None,
    help='The base directory of the corpus')

# Optional: when set, main() writes one "name,size" line per package to this
# path.
flags.DEFINE_string(
    name='per_package_output',
    default=None,
    help='The path to a CSV file containing the name of each package and the amount '
    'of bitcode that it has.')

flags.mark_flag_as_required('corpus_dir')


@ray.remote
def get_size_from_manifest(corpus_path):
  """Looks up the size recorded in a corpus' build manifest.

  Runs as a ray remote task.

  Args:
    corpus_path: Path of the corpus, handed to the dataset_corpus helpers.

  Returns:
    A tuple whose first two items are the corpus name and its size. When a
    manifest was loaded this is (name, manifest['size']). When the loader
    returned None it is (name, 0, False); note the extra trailing element in
    that case.
  """
  manifest = dataset_corpus.load_json_from_corpus(corpus_path,
                                                  "./build_manifest.json")
  name = dataset_corpus.get_corpus_name(corpus_path)

  if manifest is not None:
    return (name, manifest['size'])

  # No manifest could be loaded: report a zero size so the corpus still shows
  # up in the aggregate without contributing to the total.
  return (name, 0, False)


def main(_):
  """Aggregates the manifest sizes of every build under FLAGS.corpus_dir.

  Each entry of the corpus directory is dispatched to get_size_from_manifest
  as a ray task. The summed size is logged, and if FLAGS.per_package_output is
  set, a "name,size" line per package is written to that file, largest first.

  Args:
    _: Unused positional argument supplied by app.run.
  """
  build_corpora = os.listdir(FLAGS.corpus_dir)
  logging.info(f'Gathering data from {len(build_corpora)} builds.')

  # Launch one remote task per corpus, then block until all have finished.
  size_futures = [
      get_size_from_manifest.remote(
          os.path.join(FLAGS.corpus_dir, build_corpus))
      for build_corpus in build_corpora
  ]
  names_sizes = ray.get(size_futures)

  # Index 1 of every result tuple holds the size, regardless of tuple length.
  size_sum = sum(name_size[1] for name_size in names_sizes)
  logging.info(f'Aggregate size:{size_sum}')

  if FLAGS.per_package_output is not None:
    # Largest packages first.
    names_sizes = sorted(
        names_sizes, key=lambda name_size: name_size[1], reverse=True)
    with open(FLAGS.per_package_output, 'w') as per_package_index_file:
      for name_size in names_sizes:
        per_package_index_file.write(f'{name_size[0]},{name_size[1]}\n')
if __name__ == '__main__':
  # Only start the absl app runner when executed as a script, not on import.
  app.run(main)