EinsDreiDreiSieben/mods/moreblocks/stairsplus/scripts/create_whitelist.py
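"""Build a stairsplus whitelist by scanning every map block in a Minetest map database.

Worker processes each scan one LIMIT/OFFSET slice of the blocks table and
collect the node names they see; the union of those names is then filtered
against a stairsplus dump to produce the whitelist.
"""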

import argparse
import math
import multiprocessing
import pathlib
import time
import progressbar
import pymtdb
from whitelist_helpers import (create_whitelist, write_whitelist, create_filter, count_blocks, existing_file,
                               get_cursor, get_all_nodes)


def process_chunk(args, offset, limit, completed, results):
    """Scan one LIMIT/OFFSET slice of the blocks table and collect its node names."""
    cursor = get_cursor(args)
    # limit and offset are computed integers, so f-string interpolation is safe here
    cursor.execute(f'SELECT data FROM blocks LIMIT {limit} OFFSET {offset}')
    node_names = set()
    i = 0
    for i, row in enumerate(cursor, 1):
        node_names.update(pymtdb.MapBlockSimple.import_from_serialized(row[0]).node_names)
        if i % args.chunk_size == 0:
            # publish progress to the parent every chunk_size blocks
            completed.value = i
    completed.value = i  # final count, in case the slice size is not a multiple of chunk_size
    results.put(node_names, block=False)
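

# Worked example of the slicing in main() below (hypothetical numbers): with
# num_blocks = 100 and --workers 4, work_size = ceil(100 / 4) = 25, so the four
# slices are LIMIT 25 OFFSET 0, 25, 50, and 75, one per worker process.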
def main(args):
    num_blocks, count_blocks_elapsed = count_blocks(args)  # e.g. 345104538 blocks, counted in roughly 13*60 s
    work_size = math.ceil(num_blocks / args.workers)
    offsets = range(0, num_blocks, work_size)
    # size everything from offsets, not args.workers: rounding in work_size can leave fewer slices than workers
    completeds = tuple(multiprocessing.Value('Q', 0, lock=False) for _ in offsets)
    # because we want to terminate the processes before we remove the results from the queue, use a manager
    # see warnings in https://docs.python.org/3/library/multiprocessing.html#pipes-and-queues
    results = multiprocessing.Manager().Queue()
    processes = tuple(
        multiprocessing.Process(target=process_chunk, name=f'processor {i}',
                                args=(args, offset, work_size, completeds[i], results))
        for i, offset in enumerate(offsets)
    )
    for process in processes:
        process.start()
    print(f'NOTICE: not all jobs will start at the same time due to the nature of ranged queries. actual runtime '
          f'will be closer to 1/{min(args.workers, multiprocessing.cpu_count())}th the early estimate, plus '
          f'{count_blocks_elapsed}s.')
    # TODO: if we know how long it takes to count the blocks, and how many workers there are, we can estimate how
    #       long before a process starts producing results, and resize the jobs to maximize processor usage.
    #       proper estimation requires differential equations, ugh.
    with progressbar.ProgressBar(max_value=num_blocks) as bar:
        while True:
            time.sleep(1)
            total_completed = sum(completed.value for completed in completeds)
            bar.update(total_completed)
            if total_completed == num_blocks:
                break
    print('joining...')
    for process in processes:
        process.join()
    print('compiling results...')
    all_nodes = get_all_nodes(results)
    filter_ = create_filter(args.stairsplus_dump)
    whitelist = create_whitelist(filter_, all_nodes)
    write_whitelist(args, whitelist)


def parse_args(args=None, namespace=None):
    p = argparse.ArgumentParser()
    # exactly one map backend: a Postgres connection string or an existing sqlite file
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument('--pg_connection', '-c')
    g.add_argument('--sqlite_file', '-s', type=existing_file)
    p.add_argument('--chunk_size', type=int, default=64)
    p.add_argument('--workers', type=int, default=multiprocessing.cpu_count())
    p.add_argument('--output', '-o', type=pathlib.Path)
    p.add_argument('stairsplus_dump', type=existing_file)
    return p.parse_args(args=args, namespace=namespace)


if __name__ == "__main__":
    main(parse_args())
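
# Example invocation (the database path and dump filename are illustrative, not
# from this repo):
#   python create_whitelist.py --sqlite_file ~/.minetest/worlds/world/map.sqlite \
#       --workers 8 --output whitelist.txt stairsplus_dump.json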