words/scripts/gen_wordlist.py

67 lines
1.9 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import json
class WordListItem:
    """One entry parsed from a tab-separated word-list line.

    Zero-based fields used from the line:
      1 - the word itself
      2 - the tag stored as ``pos`` (presumably part of speech)
      3 - fallback word, used when field 1 is not purely alphabetic
      4 - integer frequency

    Field 0 and any fields after index 4 are ignored.
    """

    # Indices 0-4 are accessed, so a usable line has at least this many fields.
    _MIN_FIELDS = 5

    def __init__(self, line):
        """Parse *line* into ``word``, ``pos`` and ``freq``.

        Raises:
            ValueError: if the line has fewer than five tab-separated fields
                or the frequency field is not an integer.
        """
        fields = line.split('\t')
        if len(fields) < self._MIN_FIELDS:
            # Fail with the offending line instead of an anonymous IndexError.
            raise ValueError(
                f"expected at least {self._MIN_FIELDS} tab-separated fields, "
                f"got {len(fields)}: {line!r}"
            )
        self.word = fields[1] if fields[1].isalpha() else fields[3]
        self.pos = fields[2]
        self.freq = int(fields[4])

    def __repr__(self):
        return f"<WordListItem {self.word=} {self.pos=} {self.freq=}>"
def get_args():
    """Parse the five positional command-line arguments of the script.

    Returns:
        An ``argparse.Namespace`` with ``wordlist`` and ``allowedtypelist``
        as strings and ``word_length``, ``valid_answer_freq_min`` and
        ``valid_input_freq_min`` converted to ``int``.
    """
    import argparse

    # (argument name, extra add_argument options), in command-line order.
    positionals = (
        ('wordlist', {}),
        ('word_length', {'type': int}),
        ('valid_answer_freq_min', {'type': int}),
        ('valid_input_freq_min', {'type': int}),
        ('allowedtypelist', {}),
    )

    cli = argparse.ArgumentParser()
    for arg_name, options in positionals:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()
def main(args):
    """Build the word lists and print them as a ``wordlist = <json>`` line.

    Reads the allowed tags from ``args.allowedtypelist`` (one per line) and
    the tab-separated entries from ``args.wordlist``. Keeps the purely
    alphabetic words of length ``args.word_length`` whose tag is allowed,
    then splits them by frequency threshold into ``valid_answers`` and
    ``valid_inputs``. The JSON goes to stdout; summary statistics go to
    stderr.

    Args:
        args: Parsed command-line namespace, as returned by ``get_args()``.

    Returns:
        0, which the caller uses as the process exit status.
    """
    with open(args.allowedtypelist) as fp:
        # A set gives O(1) membership tests. Empty entries (e.g. from the
        # file's trailing newline) are dropped so that an entry with an
        # empty tag is never accepted by accident.
        # NOTE: the word list is lower-cased before parsing, so tags in this
        # file only match if they are lower-case too.
        allowed_types = {tag for tag in fp.read().split('\n') if tag}

    with open(args.wordlist) as fp:
        lines = fp.read().strip().lower().split('\n')

    # Skip blank lines so an empty file or a stray empty line is ignored
    # rather than crashing the parser.
    src_words = [WordListItem(line) for line in lines if line.strip()]
    src_words = [
        item for item in src_words
        if item.word.isalpha()
        and len(item.word) == args.word_length
        and item.pos in allowed_types
    ]
    types = {item.pos for item in src_words}

    # Remove duplicates: the same word can pass the filters more than once
    # (several entries for one word). dict.fromkeys keeps first-seen order.
    words = {
        'valid_answers': list(dict.fromkeys(
            w.word for w in src_words if w.freq >= args.valid_answer_freq_min
        )),
        'valid_inputs': list(dict.fromkeys(
            w.word for w in src_words if w.freq >= args.valid_input_freq_min
        )),
    }

    print(f"wordlist = {json.dumps(words)}")
    print(f"{args=}", file=sys.stderr)
    print(f"{len(words['valid_answers'])=}", file=sys.stderr)
    print(f"{len(words['valid_inputs'])=}", file=sys.stderr)
    print(f"{types=}", file=sys.stderr)
    return 0
if __name__ == '__main__':
    # Script entry point: exit with main()'s return value; a Ctrl-C is
    # treated as a normal, successful exit.
    try:
        exit_status = main(get_args())
    except KeyboardInterrupt:
        exit_status = 0
    sys.exit(exit_status)