# Source code for llvm_ir_dataset_utils.tools.collect_license_information
"""Tool for collecting license information on all projects and putting it into a
JSON file.
"""
import os
import logging
import json
import shutil
from absl import flags
from absl import app
import ray
from llvm_ir_dataset_utils.util import dataset_corpus
# Command-line flags for this tool; they are registered when the module is
# imported and read through FLAGS.
FLAGS = flags.FLAGS

flags.DEFINE_string(
    name='corpus_dir',
    default=None,
    help='The base directory of the corpus',
)
flags.DEFINE_string(
    name='output_file',
    default=None,
    help='The path to the output JSON file.',
)
flags.DEFINE_string(
    name='license_dir',
    default=None,
    help='The path to place license files in. Optional',
)

# The corpus location and the output path have no usable default, so both
# must be supplied; license_dir stays optional.
flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('output_file')
@ray.remote(num_cpus=1)
def get_license_information(corpus_path, license_dir):
    """Collect the license metadata recorded in a single corpus.

    Loads ``./build_manifest.json`` from the corpus, works out an archive URL
    describing where the sources came from and, when ``license_dir`` is set,
    copies every license file listed in the manifest into that directory.

    Args:
        corpus_path: Path of the corpus to inspect.
        license_dir: Directory to write the license files into. When falsy,
            no license files are written.

    Returns:
        ``None`` if the build manifest could not be loaded. Otherwise the
        tuple ``(corpus_path, license, license_source, license_files,
        archive_url)``, where the middle three values are taken verbatim
        from the manifest.
    """
    build_manifest = dataset_corpus.load_json_from_corpus(
        corpus_path, './build_manifest.json')
    if build_manifest is None:
        return None

    archive_url = ""
    if len(build_manifest["sources"]) == 0:
        # If we don't have any sources listed, this is a spack package
        archive_url = f'spack:{build_manifest["targets"][0]["name"]}'
    else:
        # Only the last listed source is consulted. Source types other than
        # "git" and "tar" leave archive_url as the empty string.
        last_source = build_manifest["sources"][-1]
        if last_source["type"] == "git":
            archive_url = last_source["repo_url"]
        elif last_source["type"] == "tar":
            archive_url = last_source["archive_url"]

    if license_dir:
        for license_file in build_manifest["license_files"]:
            license_data = dataset_corpus.load_file_from_corpus(
                corpus_path, license_file["file"])
            if license_data is None:
                # Best effort: a missing license file is reported, not fatal.
                logging.warning(
                    f'Failed to load license {license_file} in corpus {corpus_path}')
                continue
            license_path = os.path.join(license_dir, license_file["file"])
            # Make sure the destination directory exists before writing, so
            # a not-yet-created license_dir (or a manifest entry that names
            # a sub-directory) does not raise FileNotFoundError.
            # exist_ok=True keeps this safe when several tasks create the
            # same directory concurrently.
            license_parent = os.path.dirname(license_path)
            if license_parent:
                os.makedirs(license_parent, exist_ok=True)
            # The data is written through a binary-mode handle, unmodified.
            with open(license_path, "wb") as license_file_handle:
                license_file_handle.write(license_data)

    return (corpus_path, build_manifest['license'],
            build_manifest['license_source'], build_manifest["license_files"],
            archive_url)
def main(_):
    """Gather license information for every corpus and write it as JSON.

    Submits one ``get_license_information`` task per entry of
    ``FLAGS.corpus_dir``, discards the entries for which the task returned
    ``None``, and dumps the remaining results to ``FLAGS.output_file``.

    Args:
        _: Unused positional argument supplied by ``app.run``.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the records in the output file stable from run to run.
    build_corpora = sorted(os.listdir(FLAGS.corpus_dir))

    license_info_futures = [
        get_license_information.remote(
            os.path.join(FLAGS.corpus_dir, build_corpus), FLAGS.license_dir)
        for build_corpus in build_corpora
    ]
    # Blocks until every task has finished.
    raw_license_information = ray.get(license_info_futures)

    # Tasks return None when a corpus has no loadable build manifest.
    license_information = [
        license_info for license_info in raw_license_information
        if license_info is not None
    ]

    with open(FLAGS.output_file, 'w') as output_json_file:
        json.dump(license_information, output_json_file, indent=4)
# Only start the tool when this file is executed as a script, not when it is
# imported; main is handed to absl's app runner.
if __name__ == "__main__":
    app.run(main)