Source code for llvm_ir_dataset_utils.tools.aggregate_build_sizes
"""Tool for aggregating and providing statistics on bitcode size."""
import os
import logging
from absl import flags
from absl import app
import ray
from llvm_ir_dataset_utils.util import dataset_corpus
# Global absl flag container; main() reads the parsed flag values from it.
FLAGS = flags.FLAGS

# Directory whose entries are the individual build corpora to measure.
flags.DEFINE_string(
    name='corpus_dir',
    default=None,
    help='The base directory of the corpus',
)

# Optional destination for the per-package name/size listing.
flags.DEFINE_string(
    name='per_package_output',
    default=None,
    help=(
        'The path to a CSV file containing the name of each package and the amount '
        'of bitcode that it has.'
    ),
)

# corpus_dir has no usable default, so it must be supplied explicitly.
flags.mark_flag_as_required('corpus_dir')
@ray.remote
def get_size_from_manifest(corpus_path):
    """Look up the bitcode size recorded in one corpus's build manifest.

    Runs as a Ray remote task.

    Args:
      corpus_path: Path to a single build corpus; passed through to the
        dataset_corpus helpers.

    Returns:
      A two-element tuple (package_name, size). size is the 'size' entry of
      ./build_manifest.json, or 0 when load_json_from_corpus returned None
      for the manifest.
    """
    build_manifest = dataset_corpus.load_json_from_corpus(
        corpus_path, "./build_manifest.json")
    package_name = dataset_corpus.get_corpus_name(corpus_path)
    if build_manifest is None:
        # No manifest: report zero bytes, using the same (name, size) shape
        # as the success path so every result can be handled uniformly.
        return (package_name, 0)
    return (package_name, build_manifest['size'])
def main(_):
    """Aggregate the bitcode size of every build under FLAGS.corpus_dir.

    Launches one get_size_from_manifest task per directory entry, logs the
    total size, and, when FLAGS.per_package_output is set, writes one
    `name,size` line per package to that file, largest size first.

    Args:
      _: Unused positional argument supplied by app.run.
    """
    build_corpora = os.listdir(FLAGS.corpus_dir)
    logging.info(f'Gathering data from {len(build_corpora)} builds.')

    # Submit all tasks first so they can run concurrently, then collect.
    size_futures = [
        get_size_from_manifest.remote(
            os.path.join(FLAGS.corpus_dir, build_corpus))
        for build_corpus in build_corpora
    ]
    names_sizes = ray.get(size_futures)

    # Results are indexed rather than unpacked, so only the first two tuple
    # elements (name, size) are relied upon.
    size_sum = sum(name_size[1] for name_size in names_sizes)
    logging.info(f'Aggregate size:{size_sum}')

    if FLAGS.per_package_output is not None:
        names_sizes = sorted(
            names_sizes, key=lambda name_size: name_size[1], reverse=True)
        with open(FLAGS.per_package_output, 'w') as per_package_index_file:
            per_package_index_file.writelines(
                f'{name_size[0]},{name_size[1]}\n' for name_size in names_sizes)
# Script entry point: hand control to absl, which invokes main.
if __name__ == '__main__':
    app.run(main)