# Source code for llvm_ir_dataset_utils.tools.module_statistics

"""Tool for getting statistics on bitcode modules."""

import os
import logging
import csv
import sys

from absl import app
from absl import flags

import ray

from llvm_ir_dataset_utils.util import bitcode_module
from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import parallel

# Statistics that are computed at module granularity (one record per bitcode
# module). These are dispatched in batches via
# bitcode_module.get_module_statistics_batch.
MODULE_STATISTICS_TYPES = [
    'parsing', 'module_size', 'module_size_text', 'get_lowered_size',
    'get_opt_lowered_size', 'call_names', 'function_hashes',
    'module_properties', 'module_hashes', 'module_instruction_distribution',
    'defined_function_names', 'token_count', 'post_O3_function_hashes',
    'module_instruction_distribution_O3', 'module_properties_O3',
    'hf_token_count'
]

# Statistics that are computed at function granularity (one record per
# function within each bitcode module). These are dispatched one module per
# task via get_statistics_module_functions.
FUNCTION_STATISTICS_TYPES = [
    'properties', 'passes', 'post_opt_properties', 'instruction_distribution'
]

FLAGS = flags.FLAGS

flags.DEFINE_string('corpus_dir', None,
                    'The corpus directory to look for modules in.')
flags.DEFINE_string('output_file_path', None, 'The output file.')
# The statistic to collect; must be one of the module- or function-level
# statistic names above.
flags.DEFINE_enum('type', 'properties',
                  MODULE_STATISTICS_TYPES + FUNCTION_STATISTICS_TYPES,
                  'The type of statistics to collect.')
# Defaults to "no limit" by using the largest representable int.
flags.DEFINE_integer(
    'max_projects',
    sys.maxsize,
    'The maximum number of projects to process.',
    lower_bound=1)
flags.DEFINE_string('error_file_path', None, 'The path to log errors in.')
flags.DEFINE_enum(
    'language_filter', 'none', ['c', 'cpp', 'none'], 'Specify a '
    'language to filter for. This is mostly aimed at filtering '
    'for c/c++ which can coexist in the same project.')
flags.DEFINE_string(
    'vocab_path', None, 'The path to the vocab '
    'file for doing BPE tokenization. Only used for the '
    'token_count module statistics.')
flags.DEFINE_string(
    'project_filter', None,
    'A filter for projects. If the filter string is present in the project '
    'name, it is included in the statistics.')

flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_file_path')

# Number of bitcode modules handled by a single remote batch task when
# computing module-level statistics.
BITCODE_MODULE_CHUNK_SIZE = 32


@ray.remote(num_cpus=1)
def get_statistics_module_functions(project_dir, bitcode_file_path,
                                    statistics_type):
  """Computes function-level statistics for a single bitcode module.

  Args:
    project_dir: The corpus directory of the project containing the module.
    bitcode_file_path: The path of the bitcode file within the project.
    statistics_type: The function-level statistic to compute.

  Returns:
    The per-function statistics produced by
    bitcode_module.get_bitcode_module_function_statistics.
  """
  # Identify the module by its project and in-project path so results can be
  # traced back to their origin.
  module_identifier = f'{project_dir}:{bitcode_file_path}'
  module_contents = dataset_corpus.load_file_from_corpus(
      project_dir, bitcode_file_path)
  return bitcode_module.get_bitcode_module_function_statistics(
      module_contents, statistics_type, module_identifier)


@ray.remote(num_cpus=1)
def process_single_project(project_dir, statistics_type, language_filter,
                           extra_properties):
  """Computes statistics over all bitcode modules in a single project.

  Args:
    project_dir: The full path to the project's corpus directory.
    statistics_type: The statistic to compute. Module-level statistics (those
      in MODULE_STATISTICS_TYPES) are computed in batches; everything else is
      treated as a function-level statistic with one task per module.
    language_filter: A language name ('c', 'cpp', or 'none') used to restrict
      which modules are included in module-level statistics.
    extra_properties: A dict of extra configuration (e.g. 'bpe_vocab_path')
      forwarded to the module statistics batch processing.

  Returns:
    A flat list of per-module (or per-function) statistic tuples, or an empty
    list if the project's bitcode file listing could not be read.
  """
  try:
    bitcode_modules = dataset_corpus.get_bitcode_file_paths(project_dir)
  except Exception:
    # Some projects have missing/corrupt corpus descriptions; skip them, but
    # log so the failure is visible rather than silently dropped.
    logging.warning('Failed to get bitcode file paths for project %s',
                    project_dir)
    return []

  module_futures = []
  if statistics_type in MODULE_STATISTICS_TYPES:
    # We're computing a module level statistic. Split modules into batches
    # and then compute statistics over them to amortize per-task overhead.
    batches = parallel.split_batches(bitcode_modules, BITCODE_MODULE_CHUNK_SIZE)
    for batch in batches:
      module_futures.append(
          bitcode_module.get_module_statistics_batch.remote(
              project_dir, batch, statistics_type, language_filter,
              extra_properties))
  else:
    # Function-level statistic: launch one task per bitcode module.
    for bitcode_file_path in bitcode_modules:
      module_futures.append(
          get_statistics_module_functions.remote(project_dir, bitcode_file_path,
                                                 statistics_type))

  # Flatten the per-task result lists into a single list.
  statistics = []
  for module_statistic in ray.get(module_futures):
    statistics.extend(module_statistic)
  return statistics


def collect_statistics(projects_list, statistics_type):
  """Collects statistics over a list of projects and writes them to a CSV.

  Dispatches one remote task per project (subject to --project_filter and
  --max_projects), incrementally gathers the results, combines them into a
  single column-oriented table, writes any per-module errors to
  --error_file_path (if set), and writes the combined table to
  --output_file_path.

  Args:
    projects_list: The names of the project directories within
      FLAGS.corpus_dir to process.
    statistics_type: The statistic to collect for each project.
  """
  project_futures = []
  for project_dir in projects_list:
    if FLAGS.project_filter:
      if FLAGS.project_filter not in project_dir:
        continue
    full_project_path = os.path.join(FLAGS.corpus_dir, project_dir)
    extra_properties = {'bpe_vocab_path': FLAGS.vocab_path}
    project_futures.append(
        process_single_project.remote(full_project_path, statistics_type,
                                      FLAGS.language_filter, extra_properties))
    if len(project_futures) >= FLAGS.max_projects:
      break

  statistics = []
  while len(project_futures) > 0:
    # Collect results in large chunks while many tasks are outstanding to
    # reduce ray.wait overhead, but drain one at a time near the end so
    # progress logging stays responsive.
    to_return = 128 if len(project_futures) > 256 else 1
    finished, project_futures = ray.wait(
        project_futures, timeout=5.0, num_returns=to_return)
    logging.info(
        f'Just finished {len(finished)}, {len(project_futures)} remaining.')
    for project_statistics in ray.get(finished):
      statistics.extend(project_statistics)

  combined_statistics = {}
  errors = []
  for statistic in statistics:
    # Each statistic tuple is (error, data, name); a truthy error means the
    # module failed and only the error is recorded.
    if statistic[0]:
      errors.append(statistic)
    else:
      individual_data = statistic[1]
      # All columns have the same length, so measure any one of them to
      # build a matching 'name' column.
      data_length = 0
      if len(individual_data) != 0:
        data_length = len(next(iter(individual_data.values())))
      individual_data['name'] = [statistic[2]] * data_length
      # Numeric statistics are padded with 0 when a column is missing from a
      # module's data; other statistics are padded with False.
      if 'instruction_distribution' in statistics_type or 'properties' in statistics_type:
        fill_value = 0
      else:
        fill_value = False
      combined_statistics = bitcode_module.combine_statistics(
          combined_statistics, individual_data, fill_value)

  if FLAGS.error_file_path:
    with open(FLAGS.error_file_path, 'w') as error_file:
      for error in errors:
        error_file.write(f'{error[2]},{error[0]}\n')

  logging.info('Writing statistics to csv file.')
  # newline='' is required by the csv module; without it, extra blank lines
  # appear in the output on platforms with \r\n line endings.
  with open(FLAGS.output_file_path, 'w', newline='') as output_file:
    csv_writer = csv.writer(output_file)
    csv_writer.writerow(combined_statistics.keys())
    # Transpose the column-oriented dict into rows.
    csv_writer.writerows(zip(*combined_statistics.values()))
def main(_):
  """Entry point: validates flags and runs the statistics collection."""
  ray.init()

  # Perform some basic input validation before doing any work.
  if FLAGS.type == 'token_count' and FLAGS.vocab_path is None:
    logging.fatal('A vocab path must be specified when gathering token counts.')
    sys.exit(1)

  # Every entry in the corpus directory is treated as a project.
  projects = os.listdir(FLAGS.corpus_dir)
  collect_statistics(projects, FLAGS.type)
if __name__ == '__main__':
  # absl's app.run parses flags (enforcing required ones) before calling main.
  app.run(main)