# Source code for llvm_ir_dataset_utils.tools.collect_license_information
"""Tool for collecting license information on all projects and putting it into a
JSON file.
"""
import os
import logging
import json
import shutil
from absl import flags
from absl import app
import ray
from llvm_ir_dataset_utils.util import dataset_corpus
# Command-line flags for this tool. corpus_dir and output_file must be
# supplied; license_dir may be left unset.
FLAGS = flags.FLAGS

flags.DEFINE_string(
    name='corpus_dir',
    default=None,
    help='The base directory of the corpus')
flags.DEFINE_string(
    name='output_file',
    default=None,
    help='The path to the output JSON file.')
flags.DEFINE_string(
    name='license_dir',
    default=None,
    help='The path to place license files in. Optional')

flags.mark_flags_as_required(['corpus_dir', 'output_file'])
@ray.remote(num_cpus=1)
def get_license_information(corpus_path, license_dir):
  """Collects license metadata for a single corpus.

  Reads './build_manifest.json' from the corpus, derives a source URL for the
  project, and, when license_dir is set, writes every license file listed in
  the manifest underneath license_dir.

  Args:
    corpus_path: Path of the corpus to inspect.
    license_dir: Directory to write license files into. A falsy value (None
      or an empty string) disables writing license files.

  Returns:
    None if the build manifest could not be loaded. Otherwise a tuple of
    (corpus_path, license, license_source, license_files, archive_url), where
    archive_url is 'spack:<name of the first target>' when the manifest lists
    no sources, the repo_url or archive_url of the last source when that
    source is of type 'git' or 'tar' respectively, and an empty string for
    any other source type.
  """
  build_manifest = dataset_corpus.load_json_from_corpus(
      corpus_path, './build_manifest.json')
  if build_manifest is None:
    return None

  sources = build_manifest["sources"]
  archive_url = ""
  if not sources:
    # If we don't have any sources listed, this is a spack package
    archive_url = f'spack:{build_manifest["targets"][0]["name"]}'
  elif sources[-1]["type"] == "git":
    archive_url = sources[-1]["repo_url"]
  elif sources[-1]["type"] == "tar":
    archive_url = sources[-1]["archive_url"]

  if license_dir:
    for license_file in build_manifest["license_files"]:
      license_data = dataset_corpus.load_file_from_corpus(
          corpus_path, license_file["file"])
      if license_data is None:
        logging.warning('Failed to load license %s in corpus %s', license_file,
                        corpus_path)
        continue
      # NOTE(review): the destination depends only on license_dir and the
      # license file's path, so two corpora that list the same path write to
      # the same file - confirm this is intended.
      license_path = os.path.join(license_dir, license_file["file"])
      # open() does not create missing directories, so make sure the
      # destination directory exists. exist_ok tolerates other tasks creating
      # the same directory concurrently.
      license_parent_dir = os.path.dirname(license_path)
      if license_parent_dir:
        os.makedirs(license_parent_dir, exist_ok=True)
      with open(license_path, "wb") as license_file_handle:
        license_file_handle.write(license_data)

  return (corpus_path, build_manifest['license'],
          build_manifest['license_source'], build_manifest["license_files"],
          archive_url)
def main(_):
  """Gathers license information for every corpus and writes it as JSON.

  Launches one get_license_information task per entry in FLAGS.corpus_dir,
  waits for all of them, drops the entries for which no information was
  returned, and dumps the remaining results to FLAGS.output_file.

  Args:
    _: Unused positional argument supplied by app.run.
  """
  license_info_futures = [
      get_license_information.remote(
          os.path.join(FLAGS.corpus_dir, build_corpus), FLAGS.license_dir)
      for build_corpus in os.listdir(FLAGS.corpus_dir)
  ]
  raw_license_information = ray.get(license_info_futures)

  # Tasks return None when the corpus has no loadable build manifest; those
  # corpora are left out of the output.
  license_information = [
      license_info for license_info in raw_license_information
      if license_info is not None
  ]

  with open(FLAGS.output_file, 'w') as output_json_file:
    json.dump(license_information, output_json_file, indent=4)
# Script entry point: only runs when the module is executed directly, handing
# main to absl's app.run.
if __name__ == '__main__':
  app.run(main)