# Source code for llvm_ir_dataset_utils.tools.collect_textual_ir
"""A script for collecting a large amount of textual IR into a single file,
aimed primarily at training basic BPE tokenizers."""
import os
import logging
import subprocess
from absl import app
from absl import flags
from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import bitcode_module
# Handle to absl's flag registry; main() reads the parsed values from it.
FLAGS = flags.FLAGS

# main() iterates over this flag's value, so it can name several corpora.
flags.DEFINE_multi_string(
    name='corpus_dir',
    default=None,
    help='The corpora to use for generating the set of textual IR.',
)
# Destination that receives the concatenated textual IR.
flags.DEFINE_string(
    name='output_file',
    default=None,
    help='The output file to put all the textual IR into.',
)
# Upper bound on how many directory entries are taken from each corpus.
flags.DEFINE_integer(
    name='max_projects',
    default=10,
    help='The maximum number of projects per corpus.',
)

# Both flags default to None, so they must be supplied explicitly.
flags.mark_flags_as_required(['corpus_dir', 'output_file'])
def process_single_project(project_dir):
    """Return the concatenated textual IR of every bitcode file in a project.

    Args:
      project_dir: Location of the project, passed unchanged to the
        dataset_corpus helpers.

    Returns:
      One string holding the textual IR of each bitcode file that was
      converted, in the order the paths were listed. An empty string is
      returned when the bitcode paths cannot be listed or when nothing
      could be converted.
    """
    try:
        bitcode_paths = dataset_corpus.get_bitcode_file_paths(project_dir)
    except Exception:
        # Best effort: a single unreadable project must not abort the whole
        # collection run, but the failure should at least be visible.
        logging.warning(
            'Skipping %s: unable to list bitcode files', project_dir,
            exc_info=True)
        return ''

    # Collect the pieces and join once instead of repeatedly extending a
    # string, which can degrade to quadratic copying on large projects.
    textual_ir_parts = []
    for bitcode_path in bitcode_paths:
        bitcode_file_data = dataset_corpus.load_file_from_corpus(
            project_dir, bitcode_path)
        textual_ir_or_error = bitcode_module.get_textual_ir(bitcode_file_data)
        # A truthy first element is treated as a failed conversion
        # (presumably an error value; confirm against bitcode_module).
        if textual_ir_or_error[0]:
            continue
        textual_ir_parts.append(textual_ir_or_error[1])
    return ''.join(textual_ir_parts)
def main(_):
    """Collect textual IR from every requested corpus into one output file.

    For each directory in --corpus_dir, at most --max_projects entries are
    processed with process_single_project, and the resulting text is written
    back to back to --output_file.

    Args:
      _: Positional arguments handed over by the caller; unused.
    """
    textual_ir_parts = []
    for corpus_dir in FLAGS.corpus_dir:
        # os.listdir returns entries in arbitrary order; sorting first makes
        # the max_projects cut-off pick the same projects on every run.
        project_dirs = sorted(os.listdir(corpus_dir))[:FLAGS.max_projects]
        for project_dir in project_dirs:
            logging.info('Processing %s in %s', project_dir, corpus_dir)
            full_project_dir = os.path.join(corpus_dir, project_dir)
            textual_ir_parts.append(process_single_project(full_project_dir))

    # The file is opened only after all projects were processed, so a failure
    # above leaves any existing output file untouched.
    with open(FLAGS.output_file, 'w') as output_file:
        # writelines adds no separators, so the file content equals the plain
        # concatenation of all parts.
        output_file.writelines(textual_ir_parts)
# Only start the tool when this file is executed as a script, not when it is
# imported; main is handed to absl's app.run as the entry point.
if __name__ == '__main__':
    app.run(main)