# Source code for llvm_ir_dataset_utils.tools.grep_source

"""Tool for searching all the source files within a corpus"""

import os
import logging

from absl import app
from absl import flags

import ray

from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import parallel

# Handle to the absl flag registry; values are read after app.run parses argv.
FLAGS = flags.FLAGS

# Directory whose entries are each treated as one project (see main).
flags.DEFINE_string('corpus_dir', None,
                    'The corpus directory to look for projects in.')
# Text to look for; it is UTF-8 encoded before being matched against files.
flags.DEFINE_string('search_string', None, 'The string to search for.')

# Both flags default to None, so require them explicitly on the command line.
flags.mark_flag_as_required('corpus_dir')
flags.mark_flag_as_required('search_string')

# Chunk size handed to parallel.split_batches when grouping modules into
# batches for process_single_batch (presumably modules per batch -- confirm
# against parallel.split_batches).
MODULE_CHUNK_SIZE = 32


@ray.remote(num_cpus=1)
def get_source_files_in_project(project_path):
  """Lists the bitcode modules of a single project.

  Args:
    project_path: Path of the project, passed through unchanged to
      dataset_corpus.get_bitcode_file_paths.

  Returns:
    A list of (project_path, bitcode_module) tuples, one per bitcode module
    reported for the project. An empty list is returned if the modules could
    not be enumerated.
  """
  try:
    bitcode_modules = dataset_corpus.get_bitcode_file_paths(project_path)
  except Exception as error:  # Best effort: one bad project must not abort.
    # Skipping is deliberate, but leave a trace so missing projects can be
    # diagnosed instead of silently vanishing from the totals.
    logging.warning('Failed to list bitcode modules in %s, skipping: %s',
                    project_path, error)
    return []

  return [(project_path, bitcode_module) for bitcode_module in bitcode_modules]


@ray.remote(num_cpus=1)
def process_single_batch(source_file_batch, search_string):
  """Counts the files in a batch that contain the search string.

  For every module in the batch, the sibling '.source' and
  '.preprocessed_source' files are loaded from the corpus and searched.

  Args:
    source_file_batch: An iterable of (project_path, bitcode_file_path)
      tuples, as produced by get_source_files_in_project.
    search_string: The text to search for. It is encoded as UTF-8 and matched
      against the raw file contents.

  Returns:
    A tuple (string_found_source, string_found_preprocessed_source) holding
    the number of source files and preprocessed source files, respectively,
    that contain the search string.
  """
  # The needle is loop invariant, so encode it once rather than per file.
  encoded_search_string = search_string.encode('utf-8')

  string_found_source = 0
  string_found_preprocessed_source = 0
  for project_path, bitcode_file_path in source_file_batch:
    # Drop the last three characters of the bitcode path (presumably the
    # '.bc' extension -- confirm against the corpus layout) to get the stem
    # shared by the sibling files.
    module_stem = bitcode_file_path[:-3]

    source_contents = dataset_corpus.load_file_from_corpus(
        project_path, f'{module_stem}.source')
    if source_contents is None:
      # NOTE(review): a module without a source file is skipped entirely, so
      # its preprocessed source is never searched. Kept as-is to preserve the
      # reported counts.
      continue
    if encoded_search_string in source_contents:
      string_found_source += 1

    preprocessed_contents = dataset_corpus.load_file_from_corpus(
        project_path, f'{module_stem}.preprocessed_source')
    if preprocessed_contents is None:
      continue
    if encoded_search_string in preprocessed_contents:
      string_found_preprocessed_source += 1

  return (string_found_source, string_found_preprocessed_source)


def _drain_futures(pending_futures, progress_template):
  """Yields the results of Ray futures as they complete, logging progress.

  Args:
    pending_futures: A list of Ray object references to wait on.
    progress_template: A logging format string with two %d placeholders: the
      number of futures that just finished and the number still pending.

  Yields:
    The result of each future, in the order ray.wait reports them finished.
  """
  while pending_futures:
    # Collect in larger groups while the backlog is big, one at a time near
    # the end.
    to_return = 32 if len(pending_futures) > 64 else 1
    finished, pending_futures = ray.wait(
        pending_futures, timeout=5.0, num_returns=to_return)
    logging.info(progress_template, len(finished), len(pending_futures))
    yield from ray.get(finished)


def grep_projects(project_list):
  """Searches every project for FLAGS.search_string and logs the totals.

  The work happens in two Ray stages: first the bitcode modules of every
  project are gathered, then the modules are split into batches of
  MODULE_CHUNK_SIZE and each batch is searched by process_single_batch.

  Args:
    project_list: A list of project paths to search.
  """
  logging.info(f'Processing {len(project_list)} projects.')

  # Stage 1: enumerate the modules of every project in parallel.
  project_info_futures = [
      get_source_files_in_project.remote(project_path)
      for project_path in project_list
  ]
  project_infos = []
  for finished_project in _drain_futures(
      project_info_futures,
      'Just finished gathering modules from %d projects, %d remaining.'):
    project_infos.extend(finished_project)

  logging.info(
      f'Finished gathering modules, currently have {len(project_infos)}')

  module_batches = parallel.split_batches(project_infos, MODULE_CHUNK_SIZE)

  logging.info(f'Setup {len(module_batches)} batches.')

  # Stage 2: search each batch of modules in parallel and sum the counts.
  module_batch_futures = [
      process_single_batch.remote(module_batch, FLAGS.search_string)
      for module_batch in module_batches
  ]
  total_string_found_source = 0
  total_string_found_preprocessed_source = 0
  for finished_batch in _drain_futures(
      module_batch_futures, 'Just finished %d batches, %d remaining.'):
    string_found_source, string_found_preprocessed_source = finished_batch
    total_string_found_source += string_found_source
    total_string_found_preprocessed_source += string_found_preprocessed_source

  logging.info(
      f'Found {total_string_found_source} source files with the specified string.'
  )
  logging.info(
      f'Found {total_string_found_preprocessed_source} preprocessed source files with the specified string.'
  )


def main(_):
  """Entry point: greps every entry of FLAGS.corpus_dir as a project.

  Args:
    _: Positional command line arguments supplied by app.run; unused.
  """
  corpus_dir = FLAGS.corpus_dir
  project_paths = []
  for project_name in os.listdir(corpus_dir):
    project_paths.append(os.path.join(corpus_dir, project_name))

  grep_projects(project_paths)


# Run through absl so flags are parsed before main executes.
if __name__ == '__main__':
  app.run(main)