EinsDreiDreiSieben/mods/moreblocks/stairsplus/scripts/create_whitelist.py
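"""Build a stairsplus whitelist by scanning every map block in a Minetest map database.

Worker processes each scan one LIMIT/OFFSET slice of the blocks table and
collect the node names they see; the union of those names is then filtered
against a stairsplus dump to produce the whitelist.
"""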

import argparse
import math
import multiprocessing
import pathlib
import time
import progressbar
import pymtdb
from whitelist_helpers import (create_whitelist, write_whitelist, create_filter, count_blocks, existing_file,
                               get_cursor, get_all_nodes)


def process_chunk(args, offset, limit, completed, results):
    """Scan one LIMIT/OFFSET slice of the blocks table and collect its node names."""
    cursor = get_cursor(args)
    # limit and offset are computed integers, so f-string interpolation is safe here
    cursor.execute(f'SELECT data FROM blocks LIMIT {limit} OFFSET {offset}')
    node_names = set()
    i = 0
    for i, row in enumerate(cursor, 1):
        node_names.update(pymtdb.MapBlockSimple.import_from_serialized(row[0]).node_names)
        if i % args.chunk_size == 0:
            # publish progress to the parent every chunk_size blocks
            completed.value = i
    completed.value = i  # final count, in case the slice size is not a multiple of chunk_size
    results.put(node_names, block=False)
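

# Worked example of the slicing in main() below (hypothetical numbers): with
# num_blocks = 100 and --workers 4, work_size = ceil(100 / 4) = 25, so the four
# slices are LIMIT 25 OFFSET 0, 25, 50, and 75, one per worker process.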
def main(args):
    num_blocks, count_blocks_elapsed = count_blocks(args)  # e.g. 345104538 blocks, counted in roughly 13*60 s
    work_size = math.ceil(num_blocks / args.workers)
    offsets = range(0, num_blocks, work_size)
    # size everything from offsets, not args.workers: rounding in work_size can leave fewer slices than workers
    completeds = tuple(multiprocessing.Value('Q', 0, lock=False) for _ in offsets)
    # because we want to terminate the processes before we remove the results from the queue, use a manager
    # see warnings in https://docs.python.org/3/library/multiprocessing.html#pipes-and-queues
    results = multiprocessing.Manager().Queue()
    processes = tuple(
        multiprocessing.Process(target=process_chunk, name=f'processor {i}',
                                args=(args, offset, work_size, completeds[i], results))
        for i, offset in enumerate(offsets)
    )
    for process in processes:
        process.start()
    print(f'NOTICE: not all jobs will start at the same time due to the nature of ranged queries. actual runtime '
          f'will be closer to 1/{min(args.workers, multiprocessing.cpu_count())}th the early estimate, plus '
          f'{count_blocks_elapsed}s.')
    # TODO: if we know how long it takes to count the blocks, and how many workers there are, we can estimate how
    #       long before a process starts producing results, and resize the jobs to maximize processor usage.
    #       proper estimation requires differential equations, ugh.
    with progressbar.ProgressBar(max_value=num_blocks) as bar:
        while True:
            time.sleep(1)
            total_completed = sum(completed.value for completed in completeds)
            bar.update(total_completed)
            if total_completed == num_blocks:
                break
    print('joining...')
    for process in processes:
        process.join()
    print('compiling results...')
    all_nodes = get_all_nodes(results)
    filter_ = create_filter(args.stairsplus_dump)
    whitelist = create_whitelist(filter_, all_nodes)
    write_whitelist(args, whitelist)


def parse_args(args=None, namespace=None):
    p = argparse.ArgumentParser()
    # exactly one map backend: a Postgres connection string or an existing sqlite file
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument('--pg_connection', '-c')
    g.add_argument('--sqlite_file', '-s', type=existing_file)
    p.add_argument('--chunk_size', type=int, default=64)
    p.add_argument('--workers', type=int, default=multiprocessing.cpu_count())
    p.add_argument('--output', '-o', type=pathlib.Path)
    p.add_argument('stairsplus_dump', type=existing_file)
    return p.parse_args(args=args, namespace=namespace)


if __name__ == "__main__":
    main(parse_args())
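
# Example invocation (the database path and dump filename are illustrative, not
# from this repo):
#   python create_whitelist.py --sqlite_file ~/.minetest/worlds/world/map.sqlite \
#       --workers 8 --output whitelist.txt stairsplus_dump.json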