# Source code for llvm_ir_dataset_utils.tools.get_common_constants
"""Tool for getting common tokenizer constants from bitcode modules."""
import os
import logging
import sys
from absl import app
from absl import flags
import ray
from llvm_ir_dataset_utils.util import bitcode_module
from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import parallel
# Shared absl flag registry; the values below are read as FLAGS.<name> in main.
FLAGS = flags.FLAGS
# Directory whose entries are each treated as one project.
flags.DEFINE_string('corpus_dir', None,
                    'The corpus directory to look for modules in.')
# Defaults to sys.maxsize, i.e. effectively no cap; must be at least 1.
flags.DEFINE_integer(
    'max_projects',
    sys.maxsize,
    'The maximum number of projects to process.',
    lower_bound=1)
# Destination for the "constant,count" lines written by main.
flags.DEFINE_string('output_file', None, 'The output file to place results in.')
# Both path flags default to None, so they are explicitly marked as required.
flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_file')
def combine_constant_histograms(part_a, part_b):
  """Merges two constant histograms into a new one.

  Args:
    part_a: A dict mapping a constant to its occurrence count.
    part_b: A second dict of the same form.

  Returns:
    A new dict containing every constant present in either input. A constant
    present in both maps to the sum of its two counts. Neither input is
    modified.
  """
  # Start from a shallow copy so the caller's dictionary is left untouched.
  result_histogram = dict(part_a)
  for constant, count in part_b.items():
    if constant in result_histogram:
      result_histogram[constant] = result_histogram[constant] + count
    else:
      result_histogram[constant] = count
  return result_histogram
def get_constants_from_bitcode(project_dir, bitcode_file_path):
  """Counts the integer constant operands found in one bitcode file.

  Args:
    project_dir: The project location passed to
      dataset_corpus.load_file_from_corpus.
    bitcode_file_path: The path of the bitcode file to load from that project.

  Returns:
    A dict mapping each value stored under a token's 'integer_constant' key to
    the number of 'constant_integer_operand' tokens that carry it, taken over
    all functions in the tokenization of the file.
  """
  bitcode_file = dataset_corpus.load_file_from_corpus(project_dir,
                                                      bitcode_file_path)
  tokenized_functions = bitcode_module.get_tokenization(
      bitcode_file)['functions']
  constant_histogram = {}
  for function in tokenized_functions:
    for token in function['tokens']:
      # Only integer constant operands contribute to the histogram.
      if token['type'] != 'constant_integer_operand':
        continue
      constant = token['integer_constant']
      constant_histogram[constant] = constant_histogram.get(constant, 0) + 1
  return constant_histogram
@ray.remote(num_cpus=1)
def get_constants_from_bitcode_batch(project_dir, bitcode_file_paths):
  """Ray task that builds a single histogram for a batch of bitcode files.

  Args:
    project_dir: The project the bitcode files belong to.
    bitcode_file_paths: An iterable of bitcode file paths within the project.

  Returns:
    The histogram obtained by merging the per-file histograms of every path in
    the batch; an empty dict when the batch is empty.
  """
  merged = {}
  for path in bitcode_file_paths:
    per_file = get_constants_from_bitcode(project_dir, path)
    merged = combine_constant_histograms(merged, per_file)
  return merged
@ray.remote(num_cpus=1)
def get_constants_from_project(project_dir):
  """Ray task that builds the constant histogram for a whole project.

  The project's bitcode file paths are split into batches with
  parallel.split_batches(..., 16), each batch is processed by its own
  get_constants_from_bitcode_batch task, and the partial histograms are merged.

  Args:
    project_dir: The directory of the project to process.

  Returns:
    The merged histogram for the project, or an empty dict if the bitcode file
    paths of the project could not be listed.
  """
  try:
    bitcode_file_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
  except Exception:
    # Best effort: one unreadable project should not abort the whole run, but
    # record the failure instead of dropping it silently.
    logging.warning(
        'Failed to get bitcode file paths for %s, skipping.',
        project_dir,
        exc_info=True)
    return {}
  batches = parallel.split_batches(bitcode_file_paths, 16)
  batch_futures = [
      get_constants_from_bitcode_batch.remote(project_dir, batch)
      for batch in batches
  ]
  constant_histogram = {}
  # Blocks until every batch task of this project has finished.
  for partial_constant_histogram in ray.get(batch_futures):
    constant_histogram = combine_constant_histograms(
        constant_histogram, partial_constant_histogram)
  return constant_histogram
def main(_):
  """Builds a constant histogram over the corpus and writes it to a file.

  Every entry of FLAGS.corpus_dir (at most FLAGS.max_projects of them) is
  processed by a get_constants_from_project Ray task. The per-project
  histograms are merged as the tasks finish, and the result is written to
  FLAGS.output_file as one "constant,count" line per constant.

  Args:
    _: Unused positional command-line arguments passed by app.run.
  """
  ray.init()
  # max_projects is declared with lower_bound=1, so the slice always keeps at
  # least one entry whenever the corpus directory is non-empty.
  projects = os.listdir(FLAGS.corpus_dir)[:FLAGS.max_projects]
  project_futures = [
      get_constants_from_project.remote(
          os.path.join(FLAGS.corpus_dir, project)) for project in projects
  ]
  constant_histogram = {}
  while project_futures:
    # ray.wait returns the tasks that are ready and the ones still pending; an
    # empty ready list after the timeout simply leads to another wait.
    finished, project_futures = ray.wait(project_futures, timeout=5.0)
    logging.info('Just finished %d, %d remaining.', len(finished),
                 len(project_futures))
    for project_histogram in ray.get(finished):
      constant_histogram = combine_constant_histograms(constant_histogram,
                                                       project_histogram)
  with open(FLAGS.output_file, 'w') as output_file:
    for constant, count in constant_histogram.items():
      output_file.write(f'{constant},{count}\n')
# Script entry point: app.run is handed main as the function to execute.
if __name__ == '__main__':
  app.run(main)