# Source code for llvm_ir_dataset_utils.tools.get_common_constants
"""Tool for getting common tokenizer constants from bitcode modules."""
import os
import logging
import sys
from absl import app
from absl import flags
import ray
from llvm_ir_dataset_utils.util import bitcode_module
from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import parallel
FLAGS = flags.FLAGS

# Input: directory whose entries are treated as individual projects.
flags.DEFINE_string(
    name='corpus_dir',
    default=None,
    help='The corpus directory to look for modules in.')

# Optional cap on how many projects get processed; must be at least 1.
flags.DEFINE_integer(
    name='max_projects',
    default=sys.maxsize,
    help='The maximum number of projects to process.',
    lower_bound=1)

# Output: path of the file the merged histogram is written to.
flags.DEFINE_string(
    name='output_file',
    default=None,
    help='The output file to place results in.')

# Both paths have no usable default, so they must be given on the command line.
flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_file')
def combine_constant_histograms(part_a, part_b):
  """Merges two constant histograms into a new one.

  Counts for constants present in both inputs are added together; constants
  present in only one input are carried over unchanged.

  Args:
    part_a: A dict mapping a constant to its occurrence count.
    part_b: A second dict of the same form.

  Returns:
    A new dict holding the combined counts. Keys appear in a deterministic
    order: first the keys of part_a (in their order), then the keys that only
    occur in part_b. Neither input dict is modified.
  """
  # Start from a shallow copy so the caller's part_a is left untouched.
  result_histogram = dict(part_a)
  for constant, count in part_b.items():
    if constant in result_histogram:
      result_histogram[constant] = result_histogram[constant] + count
    else:
      result_histogram[constant] = count
  return result_histogram
def get_constants_from_bitcode(project_dir, bitcode_file_path):
  """Builds a histogram of integer constant operands for one bitcode file.

  The file is loaded from the corpus, tokenized, and every token whose type
  is 'constant_integer_operand' is tallied by its 'integer_constant' value.

  Args:
    project_dir: The project directory, passed to the corpus loader.
    bitcode_file_path: The path of the bitcode file within the project, passed
      to the corpus loader.

  Returns:
    A dict mapping each 'integer_constant' value seen to the number of tokens
    carrying it, in order of first appearance.
  """
  module_contents = dataset_corpus.load_file_from_corpus(
      project_dir, bitcode_file_path)
  tokenization = bitcode_module.get_tokenization(module_contents)

  counts = {}
  for function in tokenization['functions']:
    for token in function['tokens']:
      # Only integer constant operands contribute to the histogram.
      if token['type'] != 'constant_integer_operand':
        continue
      value = token['integer_constant']
      counts[value] = counts.get(value, 0) + 1
  return counts
@ray.remote(num_cpus=1)
def get_constants_from_bitcode_batch(project_dir, bitcode_file_paths):
  """Ray task that builds one histogram covering a batch of bitcode files.

  Args:
    project_dir: The project directory the bitcode files belong to.
    bitcode_file_paths: An iterable of bitcode file paths within the project.

  Returns:
    A dict with the combined constant counts of every file in the batch; an
    empty dict when the batch is empty.
  """
  merged = {}
  for path in bitcode_file_paths:
    per_file = get_constants_from_bitcode(project_dir, path)
    merged = combine_constant_histograms(merged, per_file)
  return merged
@ray.remote(num_cpus=1)
def get_constants_from_project(project_dir):
  """Ray task that builds the constant histogram for a whole project.

  The project's bitcode file paths are split into batches, each batch is
  processed by a get_constants_from_bitcode_batch task, and the per-batch
  histograms are merged.

  Args:
    project_dir: The project directory to process.

  Returns:
    A dict with the combined constant counts of the project. An empty dict is
    returned when the project's bitcode file paths cannot be listed.
  """
  try:
    bitcode_file_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
  except Exception:  # pylint: disable=broad-except
    # Best effort: one unreadable project must not abort the whole run, but
    # the failure is logged so that it does not go unnoticed.
    logging.warning(
        'Failed to get bitcode file paths for %s, skipping project.',
        project_dir,
        exc_info=True)
    return {}

  # NOTE(review): the meaning of 16 (batch size vs. batch count) is defined by
  # parallel.split_batches -- confirm there.
  batches = parallel.split_batches(bitcode_file_paths, 16)
  batch_futures = [
      get_constants_from_bitcode_batch.remote(project_dir, batch)
      for batch in batches
  ]

  constant_histogram = {}
  for partial_constant_histogram in ray.get(batch_futures):
    constant_histogram = combine_constant_histograms(
        constant_histogram, partial_constant_histogram)
  return constant_histogram
def main(_):
  """Computes the corpus-wide constant histogram and writes it to a file.

  One get_constants_from_project task is started per entry of the corpus
  directory (at most --max_projects of them). Their histograms are merged as
  the tasks finish, and the result is written to --output_file with one
  'constant,count' line per constant.
  """
  ray.init()

  # The max_projects flag has a lower bound of 1, so taking a prefix of the
  # listing selects the same projects as counting up to the limit.
  selected_projects = os.listdir(FLAGS.corpus_dir)[:FLAGS.max_projects]
  project_futures = [
      get_constants_from_project.remote(
          os.path.join(FLAGS.corpus_dir, project))
      for project in selected_projects
  ]

  constant_histogram = {}
  while project_futures:
    # Wait with a timeout so that progress is logged periodically.
    finished, project_futures = ray.wait(project_futures, timeout=5.0)
    logging.info(
        f'Just finished {len(finished)}, {len(project_futures)} remaining.')
    for project_histogram in ray.get(finished):
      constant_histogram = combine_constant_histograms(
          constant_histogram, project_histogram)

  with open(FLAGS.output_file, 'w') as output_file:
    output_file.writelines(
        f'{constant},{constant_histogram[constant]}\n'
        for constant in constant_histogram)


if __name__ == '__main__':
  app.run(main)