"""Tool for getting statistics on bitcode modules."""
import os
import logging
import csv
import sys
from absl import app
from absl import flags
import ray
from llvm_ir_dataset_utils.util import bitcode_module
from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import parallel
# Statistic names that process_single_project routes through the batched,
# module-level path (bitcode_module.get_module_statistics_batch).
MODULE_STATISTICS_TYPES = [
    'parsing',
    'module_size',
    'module_size_text',
    'get_lowered_size',
    'get_opt_lowered_size',
    'call_names',
    'function_hashes',
    'module_properties',
    'module_hashes',
    'module_instruction_distribution',
    'defined_function_names',
    'token_count',
    'post_O3_function_hashes',
    'module_instruction_distribution_O3',
    'module_properties_O3',
    'hf_token_count',
]

# Statistic names that are handled one bitcode file at a time through
# get_statistics_module_functions instead of the batched module path.
FUNCTION_STATISTICS_TYPES = [
    'properties',
    'passes',
    'post_opt_properties',
    'instruction_distribution',
]
FLAGS = flags.FLAGS

# Where to read the corpus from and where to write the results.
flags.DEFINE_string(
    name='corpus_dir',
    default=None,
    help='The corpus directory to look for modules in.')
flags.DEFINE_string(
    name='output_file_path', default=None, help='The output file.')

# Which statistic to gather; accepts any module- or function-level name.
flags.DEFINE_enum(
    name='type',
    default='properties',
    enum_values=MODULE_STATISTICS_TYPES + FUNCTION_STATISTICS_TYPES,
    help='The type of statistics to collect.')

# Upper bound on the number of projects submitted for processing.
flags.DEFINE_integer(
    name='max_projects',
    default=sys.maxsize,
    help='The maximum number of projects to process.',
    lower_bound=1)

flags.DEFINE_string(
    name='error_file_path',
    default=None,
    help='The path to log errors in.')

flags.DEFINE_enum(
    name='language_filter',
    default='none',
    enum_values=['c', 'cpp', 'none'],
    help=('Specify a language to filter for. This is mostly aimed at '
          'filtering for c/c++ which can coexist in the same project.'))

flags.DEFINE_string(
    name='vocab_path',
    default=None,
    help=('The path to the vocab file for doing BPE tokenization. Only used '
          'for the token_count module statistics.'))

flags.DEFINE_string(
    name='project_filter',
    default=None,
    help=('A filter for projects. If the filter string is present in the '
          'project name, it is included in the statistics.'))

flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_file_path')

# Batch size handed to parallel.split_batches when a project's bitcode modules
# are grouped for module-level statistics.
BITCODE_MODULE_CHUNK_SIZE = 32
@ray.remote(num_cpus=1)
def get_statistics_module_functions(project_dir, bitcode_file_path,
                                    statistics_type):
  """Ray task computing function-level statistics for one bitcode file.

  Args:
    project_dir: The project location handed to
      dataset_corpus.load_file_from_corpus.
    bitcode_file_path: The bitcode file to load from that project.
    statistics_type: The name of the statistic to compute.

  Returns:
    The result of bitcode_module.get_bitcode_module_function_statistics for
    the loaded file.
  """
  module_contents = dataset_corpus.load_file_from_corpus(
      project_dir, bitcode_file_path)
  # "<project>:<file>" label passed along so results can be traced back to
  # the module they came from.
  module_identifier = f'{project_dir}:{bitcode_file_path}'
  function_statistics = bitcode_module.get_bitcode_module_function_statistics(
      module_contents, statistics_type, module_identifier)
  return function_statistics
@ray.remote(num_cpus=1)
def process_single_project(project_dir, statistics_type, language_filter,
                           extra_properties):
  """Ray task gathering one statistic over every bitcode module in a project.

  Module-level statistics (names listed in MODULE_STATISTICS_TYPES) are
  computed over batches of BITCODE_MODULE_CHUNK_SIZE modules; every other
  statistic spawns one get_statistics_module_functions task per bitcode file.

  Args:
    project_dir: The project location to enumerate bitcode files from.
    statistics_type: The name of the statistic to compute.
    language_filter: Forwarded to the module-level batch task.
    extra_properties: Forwarded to the module-level batch task.

  Returns:
    A flat list concatenating the results of all sub-tasks, or an empty list
    if the project's bitcode file paths could not be enumerated.
  """
  try:
    bitcode_modules = dataset_corpus.get_bitcode_file_paths(project_dir)
  except Exception as error:  # pylint: disable=broad-except
    # Best effort: one unreadable project must not abort the whole run, but
    # leave a trace so missing projects can be explained afterwards.
    logging.warning('Skipping project %s, could not list bitcode files: %s',
                    project_dir, error)
    return []

  if statistics_type in MODULE_STATISTICS_TYPES:
    # We're computing a module level statistic. Split modules into batches
    # and then compute statistics over them.
    module_futures = [
        bitcode_module.get_module_statistics_batch.remote(
            project_dir, batch, statistics_type, language_filter,
            extra_properties)
        for batch in parallel.split_batches(bitcode_modules,
                                            BITCODE_MODULE_CHUNK_SIZE)
    ]
  else:
    module_futures = [
        get_statistics_module_functions.remote(project_dir, bitcode_file_path,
                                               statistics_type)
        for bitcode_file_path in bitcode_modules
    ]

  # Each sub-task yields a list of results; flatten them into one list.
  statistics = []
  for module_statistic in ray.get(module_futures):
    statistics.extend(module_statistic)
  return statistics
def collect_statistics(projects_list, statistics_type):
  """Collects a statistic over a set of projects and writes it out as CSV.

  One process_single_project task is submitted per selected project. Results
  are gathered as tasks finish, merged with bitcode_module.combine_statistics
  and written to FLAGS.output_file_path. Entries whose first element is truthy
  are treated as errors and, if FLAGS.error_file_path is set, written there as
  "<name>,<error>" lines.

  Reads FLAGS.project_filter, FLAGS.corpus_dir, FLAGS.vocab_path,
  FLAGS.language_filter, FLAGS.max_projects, FLAGS.error_file_path and
  FLAGS.output_file_path.

  Args:
    projects_list: Project directory names, relative to FLAGS.corpus_dir.
    statistics_type: The name of the statistic to compute.
  """
  # Identical for every project, so build it once.
  extra_properties = {'bpe_vocab_path': FLAGS.vocab_path}

  project_futures = []
  for project_dir in projects_list:
    if FLAGS.project_filter and FLAGS.project_filter not in project_dir:
      continue
    full_project_path = os.path.join(FLAGS.corpus_dir, project_dir)
    project_futures.append(
        process_single_project.remote(full_project_path, statistics_type,
                                      FLAGS.language_filter, extra_properties))
    if len(project_futures) >= FLAGS.max_projects:
      break

  statistics = []
  while project_futures:
    # Drain in larger chunks while the backlog is big, one at a time near the
    # end; the timeout keeps the progress log moving either way.
    to_return = 128 if len(project_futures) > 256 else 1
    finished, project_futures = ray.wait(
        project_futures, timeout=5.0, num_returns=to_return)
    logging.info('Just finished %d, %d remaining.', len(finished),
                 len(project_futures))
    for project_statistics in ray.get(finished):
      statistics.extend(project_statistics)

  # The fill value depends only on the statistic type, not on the entry.
  if ('instruction_distribution' in statistics_type or
      'properties' in statistics_type):
    fill_value = 0
  else:
    fill_value = False

  combined_statistics = {}
  errors = []
  for statistic in statistics:
    # Each entry is indexed as (error, data, name); a truthy error marks a
    # failed module.
    if statistic[0]:
      errors.append(statistic)
      continue
    individual_data = statistic[1]
    data_length = 0
    if len(individual_data) != 0:
      data_length = len(next(iter(individual_data.values())))
    # Tag every row of this entry with the name it came from.
    individual_data['name'] = [statistic[2]] * data_length
    combined_statistics = bitcode_module.combine_statistics(
        combined_statistics, individual_data, fill_value)

  if FLAGS.error_file_path:
    with open(FLAGS.error_file_path, 'w') as error_file:
      error_file.writelines(f'{error[2]},{error[0]}\n' for error in errors)
  elif errors:
    logging.warning(
        '%d modules reported errors; set --error_file_path to record them.',
        len(errors))

  logging.info('Writing statistics to csv file.')
  # newline='' is required by the csv module so that it controls line endings
  # itself (otherwise extra blank rows appear on some platforms).
  with open(FLAGS.output_file_path, 'w', newline='') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(combined_statistics.keys())
    csv_writer.writerows(zip(*combined_statistics.values()))
def main(_):
  """Entry point: validates flags, starts Ray and collects the statistics.

  Args:
    _: Unused positional arguments supplied by app.run.
  """
  # Perform some basic input validation before starting Ray, so that an
  # invalid invocation fails immediately instead of after Ray start-up.
  if FLAGS.type == 'token_count' and FLAGS.vocab_path is None:
    logging.fatal('A vocab path must be specified when gathering token counts.')
    sys.exit(1)

  ray.init()

  projects = os.listdir(FLAGS.corpus_dir)
  collect_statistics(projects, FLAGS.type)
# Only run the tool when executed as a script, not when imported.
if __name__ == '__main__':
  app.run(main)