# Source code for llvm_ir_dataset_utils.builders.cargo_builder

"""Module for building and extracting bitcode from applications using cargo"""

import subprocess
import os
import json
import multiprocessing
import shutil
import pathlib
import logging

import ray

from mlgo.corpus import make_corpus_lib
from mlgo.corpus import combine_training_corpus_lib

# Maximum time, in seconds, that a single cargo build invocation may run
# before it is aborted (passed as `timeout=` to subprocess.run).
BUILD_TIMEOUT = 900


def get_spec_from_id(id):
    """Convert a package ID from `cargo metadata` into a `cargo -p` spec.

    Two ID layouts are handled:

    * The legacy layout ``"<name> <version> (path+<url>)"`` is rewritten to
      ``"<url>#<name>@<version>"``.
    * Any other ID is returned unchanged. Newer cargo releases emit IDs that
      are already package ID specs (e.g. ``"path+file:///src/foo#0.1.0"``)
      and can be passed to ``--package`` directly.

    Args:
      id: The ``id`` field of a package entry in the `cargo metadata` output.
        The parameter name shadows the builtin but is kept so that existing
        keyword callers continue to work.

    Returns:
      A string usable as the argument of ``cargo rustc -p``.
    """
    # Split on the first " (" only, so parentheses inside the path survive.
    name_version, separator, source = id.partition(' (')
    if not separator or not source.endswith(')'):
        # Not the legacy layout: the ID already is a package ID spec.
        return id

    fields = name_version.split(' ')
    if len(fields) < 2:
        # Malformed legacy ID; hand it back rather than guessing.
        return id
    name, version = fields[0], fields[1]

    # Drop the closing parenthesis and the "path+" source-kind prefix.
    source = source[:-1]
    if source.startswith('path+'):
        source = source[len('path+'):]
    return f'{source}#{name}@{version}'
def get_packages_from_manifest(source_dir):
    """List the build targets of every package found in a cargo workspace.

    Runs ``cargo metadata --no-deps`` inside `source_dir` and groups the
    reported targets by package name.

    Args:
      source_dir: Directory in which to run cargo.

    Returns:
      A dict mapping each package name to a list of target dicts with the
      keys ``name``, ``kind`` (the first entry of the target's kind list),
      ``spec`` and ``package``. If `source_dir` does not exist or the cargo
      command fails, an empty list is returned instead.
    """
    if not os.path.exists(source_dir):
        return []

    try:
        # TODO(boomanaiden154): Dump the stderr of the metadata command to a
        # log somewhere
        raw_metadata = subprocess.check_output(
            ["cargo", "metadata", "--no-deps"],
            cwd=source_dir,
            stderr=subprocess.PIPE)
    except subprocess.SubprocessError:
        return []

    manifest = json.loads(raw_metadata.decode("utf-8"))
    return {
        package["name"]: [{
            "name": target["name"],
            "kind": target["kind"][0],
            "spec": get_spec_from_id(package['id']),
            "package": package['name'],
        } for target in package["targets"]]
        for package in manifest["packages"]
    }
def get_build_log_name(target):
    """Return the relative build-log path for a target.

    Args:
      target: A target dict providing the ``name`` and ``kind`` keys.

    Returns:
      A string of the form ``./<name>.<kind>.build.log``.
    """
    return f"./{target['name']}.{target['kind']}.build.log"
def build_all_targets(source_dir, build_dir, corpus_dir, threads,
                      extra_env_variables, cleanup):
    """Build every package found in `source_dir` and merge the results.

    One remote build task is launched per package, each with its own build
    directory named ``<build_dir>-<package>``. After all tasks finish, the
    per-package corpora under `corpus_dir` are combined.

    Args:
      source_dir: Directory containing the cargo project.
      build_dir: Prefix for the per-package build directories.
      corpus_dir: Directory that receives build logs and extracted bitcode.
      threads: Number of CPUs/jobs given to each package build.
      extra_env_variables: Extra environment variables for the builds.
      cleanup: Whether each package build directory is deleted afterwards.

    Returns:
      A dict with a single ``targets`` key holding the per-target build
      records of all packages.
    """
    packages = get_packages_from_manifest(source_dir)

    # Index by name instead of using .items(): on failure the manifest
    # helper returns an empty list rather than a dict.
    pending_builds = [
        build_package_future(source_dir, build_dir + '-' + package_name,
                             corpus_dir, packages[package_name], threads,
                             extra_env_variables, cleanup)
        for package_name in packages
    ]

    target_records = []
    for package_records in ray.get(pending_builds):
        target_records.extend(package_records)

    combine_training_corpus_lib.combine_corpus(corpus_dir)
    return {'targets': target_records}
def build_package_future(source_dir, build_dir, corpus_dir, targets, threads,
                         extra_env_variables, cleanup):
    """Schedule a remote `build_package` task and return its future.

    The task's CPU reservation is overridden with `threads`; all arguments
    are forwarded to `build_package` unchanged.

    Returns:
      The object returned by the task's ``.remote(...)`` call, to be
      resolved with ``ray.get``.
    """
    sized_task = build_package.options(num_cpus=threads)
    return sized_task.remote(source_dir, build_dir, corpus_dir, targets,
                             threads, extra_env_variables, cleanup)
@ray.remote(num_cpus=multiprocessing.cpu_count())
def build_package(source_dir, build_dir, corpus_dir, targets, threads,
                  extra_env_variables, cleanup):
    """Build all targets of one package and extract their bitcode.

    Args:
      source_dir: Directory containing the cargo project.
      build_dir: Cargo target directory used for this package.
      corpus_dir: Directory that receives the build logs; the package's
        bitcode is placed in a sub-directory named after the package.
      targets: Non-empty list of target dicts belonging to the same package.
      threads: Number of parallel jobs passed to cargo.
      extra_env_variables: Extra environment variables for the builds.
      cleanup: Whether to delete `build_dir` once extraction is done.

    Returns:
      A list with one build record (see `perform_build`) per target.

    Raises:
      FileExistsError: If the package corpus directory already exists.
      FileNotFoundError: If `corpus_dir` itself does not exist.
    """
    build_log = [
        perform_build(source_dir, build_dir, corpus_dir, target, threads,
                      extra_env_variables) for target in targets
    ]

    package_corpus_dir = os.path.join(corpus_dir, targets[0]["package"])
    # We should never be creating the parents of the folder as they should be
    # provided by builder.py and the folder should never exist before we
    # create it.
    pathlib.Path(package_corpus_dir).mkdir(exist_ok=False, parents=False)
    extract_ir(build_dir, package_corpus_dir)

    if cleanup and os.path.exists(build_dir):
        try:
            shutil.rmtree(build_dir)
        except Exception:
            # Best-effort cleanup: a failure here must not fail the build.
            logging.warning(
                f'Failed to delete directory {build_dir}, probably deleted by another process.'
            )
    return build_log
def perform_build(source_dir, build_dir, corpus_dir, target, threads,
                  extra_env_variables):
    """Build a single cargo target, emitting LLVM bitcode.

    Invokes ``cargo rustc`` for `target` with ``--emit=llvm-bc`` and writes
    the combined stdout/stderr of the build to a log file in `corpus_dir`.

    Args:
      source_dir: Directory in which cargo is run.
      build_dir: Value used for the ``CARGO_TARGET_DIR`` environment variable.
      corpus_dir: Directory in which the build log is written.
      target: Target dict with the keys ``name``, ``kind``, ``spec`` and
        ``package``.
      threads: Number of parallel jobs passed to cargo via ``-j``.
      extra_env_variables: Mapping of extra environment variables; these
        override both the inherited environment and ``CARGO_TARGET_DIR``.

    Returns:
      A dict with the keys ``success`` (bool), ``build_log`` (relative log
      file name, or None when the target kind is not recognized and no
      build was attempted) and ``name`` (``<name>.<kind>``).
    """
    logging.info(
        f"Building target {target['name']} of type {target['kind']} from package {target['package']}"
    )
    build_env = os.environ.copy()
    build_env["CARGO_TARGET_DIR"] = build_dir
    build_env.update(extra_env_variables)

    result_name = target['name'] + '.' + target['kind']

    # cargo arguments that select the target, keyed by target kind.
    target_selectors = {
        "lib": ["--lib"],
        "test": ["--test", target['name']],
        "bench": ["--bench", target['name']],
        "bin": ["--bin", target['name']],
        "example": ["--example", target['name']],
    }
    selector = target_selectors.get(target['kind'])
    if selector is None:
        logging.warning(
            f'{target["name"]} has unrecognized target type {target["kind"]} in package {target["package"]}'
        )
        return {'success': False, 'build_log': None, 'name': result_name}

    build_command_vector = [
        "cargo", "rustc", "-p", f"{target['spec']}", "-j",
        str(threads)
    ]
    build_command_vector.extend(selector)
    # Everything after "--" is forwarded to rustc itself.
    build_command_vector.extend(
        ["--", '--emit=llvm-bc', '-C', 'no-prepopulate-passes'])

    build_log_name = get_build_log_name(target)
    try:
        build_log_path = os.path.join(corpus_dir, build_log_name)
        with open(build_log_path, 'w') as build_log_file:
            subprocess.run(
                build_command_vector,
                cwd=source_dir,
                env=build_env,
                check=True,
                stdout=build_log_file,
                stderr=build_log_file,
                timeout=BUILD_TIMEOUT)
    except subprocess.SubprocessError:
        # Covers both a non-zero exit status and a timeout.
        logging.warning(
            f"Failed to build target {target['name']} of type {target['kind']} from package {target['package']}"
        )
        build_success = False
    else:
        logging.info(
            f"Finished building target {target['name']} of type {target['kind']} from package {target['package']}"
        )
        build_success = True

    return {
        'success': build_success,
        'build_log': build_log_name,
        'name': result_name
    }
def extract_ir(build_dir, corpus_dir):
    """Copy the bitcode found under `build_dir` into `corpus_dir`.

    The bitcode files located in `build_dir` are copied to `corpus_dir` and
    a corpus manifest listing them is written there.

    Args:
      build_dir: Directory searched for bitcode.
      corpus_dir: Destination directory for the bitcode and the manifest.
    """
    # TODO(boomanaiden154): Look into getting a build manifest from cargo.
    bitcode_paths = make_corpus_lib.load_bitcode_from_directory(build_dir)
    make_corpus_lib.copy_bitcode(bitcode_paths, build_dir, corpus_dir)
    make_corpus_lib.write_corpus_manifest(bitcode_paths, corpus_dir, '')