# Source code for llvm_ir_dataset_utils.tools.link_files
"""Tool for running llvm-link over all bitcode files in a corpus."""
import pathlib
import os
import subprocess
import logging
from absl import app
from absl import flags
import ray
FLAGS = flags.FLAGS

flags.DEFINE_string('corpus_dir', None, 'The path to the corpus directory.')
flags.DEFINE_string('output_dir', None, 'The path to the output directory.')

# Both flags default to None and are consumed unconditionally: main() lists
# corpus_dir and hands output_dir straight to pathlib.Path, which raises a
# TypeError on None. Requiring both makes a missing flag a clear usage error
# at startup rather than a traceback.
flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_dir')
@ray.remote(num_cpus=1)
def link_package(folder_path, output_dir):
  """Links all bitcode files found under a package folder with llvm-link.

  The folder is searched recursively for `*.bc` files. The first file is
  passed to llvm-link as a plain input and every following file is passed
  with `-override`. The result is written to
  `<output_dir>/<name of folder_path>.bc`.

  Args:
    folder_path: Path to the package folder to search for bitcode files.
    output_dir: Directory that the linked bitcode file is written into.

  Returns:
    A `(success, output_path)` tuple. `success` is True only when llvm-link
    exited with status 0. `output_path` is None when no bitcode files were
    found or llvm-link could not be launched; otherwise it is the path given
    to `-o`, regardless of whether the link succeeded.
  """
  # TODO(boomanaiden154): Pull from a corpus_manifest/meta corpus manifest
  # rather than glob for the bitcode files once they're available in all of
  # my builds.
  # Sorted because glob order depends on the filesystem, and with -override
  # the order of the inputs influences which definitions end up in the
  # output. Sorting keeps the linked module reproducible between runs.
  bitcode_files = sorted(pathlib.Path(folder_path).glob('**/*.bc'))
  if not bitcode_files:
    return (False, None)

  command_vector = ['llvm-link', bitcode_files[0]]
  for bitcode_file in bitcode_files[1:]:
    command_vector.extend(['-override', bitcode_file])

  # normpath strips any trailing separator; without it basename() of
  # 'corpus/pkg/' is '' and every package would be written to '.bc'.
  package_name = os.path.basename(os.path.normpath(folder_path))
  output_file_path = os.path.join(output_dir, package_name + '.bc')
  command_vector.extend(['-o', output_file_path])

  try:
    command_output = subprocess.run(
        command_vector, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  except OSError:
    # llvm-link could not be started at all (e.g. it is not on PATH).
    return (False, None)

  return (command_output.returncode == 0, output_file_path)
def main(_):
  """Links every package folder in the corpus and logs a summary.

  Creates FLAGS.output_dir if needed, submits one `link_package` task for
  each entry in FLAGS.corpus_dir, then repeatedly waits on the outstanding
  tasks, counting successes and collecting the second tuple element of each
  failed result.

  Args:
    _: Unused; the positional arguments handed over by `app.run`.
  """
  pathlib.Path(FLAGS.output_dir).mkdir(exist_ok=True, parents=True)

  package_processing_futures = [
      link_package.remote(
          os.path.join(FLAGS.corpus_dir, corpus_folder), FLAGS.output_dir)
      for corpus_folder in os.listdir(FLAGS.corpus_dir)
  ]

  link_success = 0
  link_failures = []

  while len(package_processing_futures) > 0:
    # Wait for results in batches of 128 while plenty of tasks remain, then
    # drop to one at a time so num_returns never exceeds the number of
    # outstanding futures.
    to_wait_for = 128
    if len(package_processing_futures) < 256:
      to_wait_for = 1
    # With the timeout, `finished` may hold fewer than to_wait_for entries
    # (possibly none); the loop simply polls again in that case.
    finished, package_processing_futures = ray.wait(
        package_processing_futures, timeout=5.0, num_returns=to_wait_for)
    finished_data = ray.get(finished)
    for finished_link in finished_data:
      if finished_link[0]:
        link_success += 1
      else:
        link_failures.append(finished_link[1])
    logging.info(
        f'Just finished {len(finished_data)}, {len(package_processing_futures)} remaining.'
    )

  logging.info(
      f'Got {link_success} successes and {len(link_failures)} failures.')
if __name__ == '__main__':
  # Only start the tool when executed as a script; app.run handles flag
  # parsing before calling main.
  app.run(main)