# Source code for llvm_ir_dataset_utils.tools.link_files

"""Tool for running llvm-link over all bitcode files in a corpus."""

import pathlib
import os
import subprocess
import logging

from absl import app
from absl import flags

import ray

FLAGS = flags.FLAGS

flags.DEFINE_string('corpus_dir', None, 'The path to the corpus directory.')
flags.DEFINE_string('output_dir', None, 'The path to the output directory.')

# Both paths are mandatory. main() builds a pathlib.Path from output_dir
# unconditionally, so leaving it unset (None) would otherwise surface as an
# opaque TypeError instead of a flag-validation error.
flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_dir')


@ray.remote(num_cpus=1)
def link_package(folder_path, output_dir):
  """Links every bitcode file found under a package folder into one module.

  Recursively collects all `*.bc` files below `folder_path` and runs
  `llvm-link` over them, writing the result to
  `<output_dir>/<package name>.bc`, where the package name is the last
  component of `folder_path`.

  Args:
    folder_path: Path to the package folder to search for bitcode files.
    output_dir: Directory that the linked bitcode file is written to.

  Returns:
    A `(success, output_path)` tuple. `success` is True only when llvm-link
    exits with status 0. `output_path` is the intended output file path, or
    None when no bitcode files were found or llvm-link could not be started.
  """
  # TODO(boomanaiden154): Pull from a corpus_manifest/meta corpus manifest
  # rather than glob for the bitcode files once they're available in all of
  # my builds.
  # Sort the matches: glob order depends on the filesystem, and because the
  # files after the first are passed with -override, the linked result can
  # depend on the order in which they are given.
  bitcode_files = sorted(pathlib.Path(folder_path).glob('**/*.bc'))

  if not bitcode_files:
    return (False, None)

  # The first module is linked normally; every later one may override
  # symbols that were already defined.
  command_vector = ['llvm-link', bitcode_files[0]]
  for bitcode_file in bitcode_files[1:]:
    command_vector.extend(['-override', bitcode_file])

  # normpath drops any trailing separator so that basename does not come back
  # empty (which would make the output '<output_dir>/.bc').
  package_name = os.path.basename(os.path.normpath(folder_path))
  output_file_path = os.path.join(output_dir, package_name + '.bc')
  command_vector.extend(['-o', output_file_path])

  try:
    command_output = subprocess.run(
        command_vector, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  except OSError:
    # The process could not be launched at all.
    return (False, None)

  return (command_output.returncode == 0, output_file_path)


def main(_):
  """Links every entry of the corpus directory and logs a result tally.

  Creates the output directory, submits one `link_package` task per entry of
  the corpus directory, then collects the task results in batches while
  logging progress, and finally logs the success and failure counts.
  """
  output_root = FLAGS.output_dir
  corpus_root = FLAGS.corpus_dir

  pathlib.Path(output_root).mkdir(exist_ok=True, parents=True)

  # One remote linking task per directory entry of the corpus.
  pending = [
      link_package.remote(os.path.join(corpus_root, entry), output_root)
      for entry in os.listdir(corpus_root)
  ]

  link_success = 0
  link_failures = []

  while pending:
    # Ask for results in batches of 128 while the backlog is large, and one
    # at a time once fewer than 256 tasks remain.
    batch_size = 1 if len(pending) < 256 else 128
    done, pending = ray.wait(pending, timeout=5.0, num_returns=batch_size)
    results = ray.get(done)

    for succeeded, linked_path in results:
      if succeeded:
        link_success += 1
      else:
        link_failures.append(linked_path)

    logging.info(f'Just finished {len(results)}, {len(pending)} remaining.')

  logging.info(
      f'Got {link_success} successes and {len(link_failures)} failures.')
# Script entry point: hand control to absl, which invokes main().
if __name__ == '__main__':
  app.run(main)