# Source code for llvm_ir_dataset_utils.tools.grep_source

"""Tool for searching all the source files within a corpus for a string."""

import os
import logging

from absl import app
from absl import flags

import ray

from llvm_ir_dataset_utils.util import dataset_corpus
from llvm_ir_dataset_utils.util import parallel

# Passed to parallel.split_batches when grouping the gathered modules into
# batches for the remote grep tasks.
MODULE_CHUNK_SIZE = 32

FLAGS = flags.FLAGS

flags.DEFINE_string(
    'corpus_dir',
    default=None,
    help='The corpus directory to look for projects in.')
flags.DEFINE_string(
    'search_string', default=None, help='The string to search for.')

# Both flags default to None, so the tool refuses to start without them.
flags.mark_flags_as_required(['corpus_dir', 'search_string'])


@ray.remote(num_cpus=1)
def get_source_files_in_project(project_path):
  """Lists the bitcode modules of a project as (project, module) pairs.

  Args:
    project_path: Path of the project to inspect.

  Returns:
    A list of (project_path, bitcode_module) tuples, one per path returned by
    dataset_corpus.get_bitcode_file_paths. An empty list is returned if that
    call raises, so a single bad project does not abort the whole run.
  """
  try:
    bitcode_modules = dataset_corpus.get_bitcode_file_paths(project_path)
  except Exception:
    # Deliberately best effort, but leave a trace: a silently dropped project
    # would make the final counts incomplete with no way to tell why.
    logging.warning(
        'Failed to get bitcode file paths for %s, skipping project.',
        project_path,
        exc_info=True)
    return []

  return [(project_path, bitcode_module) for bitcode_module in bitcode_modules]


@ray.remote(num_cpus=1)
def process_single_batch(source_file_batch, search_string):
  """Counts the files of a batch of modules that contain search_string.

  For each module, the '.source' and '.preprocessed_source' files that sit
  next to the bitcode file are loaded from the corpus and searched.

  Args:
    source_file_batch: Iterable of (project_path, bitcode_file_path) pairs.
    search_string: The text to look for. It is encoded as UTF-8 and searched
      for in the loaded file contents.

  Returns:
    A (source_count, preprocessed_source_count) tuple with the number of
    source files and preprocessed source files that contain the string.
  """
  # The needle is the same for every file, so encode it once up front rather
  # than once (or twice) per module.
  encoded_search_string = search_string.encode('utf-8')

  string_found_source = 0
  string_found_preprocessed_source = 0
  for project_path, bitcode_file_path in source_file_batch:
    # Drop the last three characters of the bitcode path (presumably the '.bc'
    # extension -- TODO confirm) to get the stem shared by the sibling files.
    module_stem = bitcode_file_path[:-3]

    source_contents = dataset_corpus.load_file_from_corpus(
        project_path, f'{module_stem}.source')
    if source_contents is None:
      # NOTE: a module without a source file is skipped entirely, so its
      # preprocessed source (if any) is not searched either.
      continue
    if source_contents.find(encoded_search_string) != -1:
      string_found_source += 1

    preprocessed_source_contents = dataset_corpus.load_file_from_corpus(
        project_path, f'{module_stem}.preprocessed_source')
    if preprocessed_source_contents is None:
      continue

    if preprocessed_source_contents.find(encoded_search_string) != -1:
      string_found_preprocessed_source += 1
  return (string_found_source, string_found_preprocessed_source)


def _wait_in_rounds(pending):
  """Yields (finished_refs, remaining_count) until no future is pending.

  Each round waits for up to five seconds on 32 futures while more than 64 are
  outstanding, and on a single future otherwise.
  """
  while pending:
    wanted = 32 if len(pending) > 64 else 1
    done, pending = ray.wait(pending, timeout=5.0, num_returns=wanted)
    yield done, len(pending)


def grep_projects(project_list):
  """Counts the source files across the given projects that contain a string.

  The modules of every project are gathered with remote tasks, split into
  batches of MODULE_CHUNK_SIZE, and each batch is searched remotely for
  FLAGS.search_string. Progress and the final totals are written to the log.

  Args:
    project_list: Paths of the projects to search.
  """
  logging.info(f'Processing {len(project_list)} projects.')

  # Stage 1: collect the (project, module) pairs of every project.
  gather_futures = [
      get_source_files_in_project.remote(project_path)
      for project_path in project_list
  ]

  project_infos = []
  for done, remaining in _wait_in_rounds(gather_futures):
    logging.info(
        f'Just finished gathering modules from {len(done)} projects, {remaining} remaining.'
    )
    for project_modules in ray.get(done):
      project_infos.extend(project_modules)

  logging.info(
      f'Finished gathering modules, currently have {len(project_infos)}')

  # Stage 2: search the modules, one remote task per batch.
  module_batches = parallel.split_batches(project_infos, MODULE_CHUNK_SIZE)

  logging.info(f'Setup {len(module_batches)} batches.')

  grep_futures = [
      process_single_batch.remote(module_batch, FLAGS.search_string)
      for module_batch in module_batches
  ]

  total_string_found_source = 0
  total_string_found_preprocessed_source = 0
  for done, remaining in _wait_in_rounds(grep_futures):
    logging.info(f'Just finished {len(done)} batches, {remaining} remaining.')
    for source_hits, preprocessed_source_hits in ray.get(done):
      total_string_found_source += source_hits
      total_string_found_preprocessed_source += preprocessed_source_hits

  logging.info(
      f'Found {total_string_found_source} source files with the specified string.'
  )
  logging.info(
      f'Found {total_string_found_preprocessed_source} preprocessed source files with the specified string.'
  )


def main(_):
  """Greps every entry of the --corpus_dir directory as a project."""
  grep_projects([
      os.path.join(FLAGS.corpus_dir, entry)
      for entry in os.listdir(FLAGS.corpus_dir)
  ])


# Script entry point: hand main to absl's app.run when this file is executed
# directly rather than imported.
if __name__ == '__main__':
  app.run(main)