words/scripts/gen_wordlist.py

49 lines
1.3 KiB
Python
Raw Normal View History

2022-01-27 18:36:45 +00:00
#!/usr/bin/env python3
import sys
import json
def get_args():
    """Parse and return the command line arguments.

    Positional arguments, in order:
        wordlist         path of the tab-separated word list to read
        word_length      exact length a word must have to be kept (int)
        frequency_min    lowest frequency a word may have to be kept (int)
        allowedtypelist  path of a file listing the accepted word types,
                         one per line

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # Imported locally: argparse is only needed when run as a script.
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate a filtered word list from a tab-separated file.",
    )
    parser.add_argument(
        'wordlist',
        help="tab-separated word list file to read",
    )
    parser.add_argument(
        'word_length',
        type=int,
        help="exact number of letters a word must have",
    )
    parser.add_argument(
        'frequency_min',
        type=int,
        help="minimum frequency a word must have",
    )
    parser.add_argument(
        'allowedtypelist',
        help="file listing the allowed word types, one per line",
    )
    return parser.parse_args()
def _read_allowed_types(path):
    """Return the set of non-blank lines of *path*, stripped and lower-cased."""
    with open(path) as fp:
        # Lower-cased because row types are lower-cased before comparison;
        # blank lines are dropped so an empty type field never matches.
        return {line.strip().lower() for line in fp if line.strip()}


def _read_entries(path):
    """Parse *path* into a list of ``(word, frequency, type)`` tuples.

    Every non-blank line is lower-cased and split on tabs; field 1 is taken
    as the word, field 2 as its type and field 4 as its integer frequency.
    Lines with too few fields or a non-integer frequency still raise
    (``IndexError`` / ``ValueError``), so bad input is not silently dropped.
    """
    entries = []
    with open(path) as fp:
        for line in fp:
            if not line.strip():
                continue  # tolerate blank lines and an empty file
            fields = line.rstrip('\n').lower().split('\t')
            entries.append((fields[1], int(fields[4]), fields[2]))
    return entries


def main(args):
    """Entry point for script.

    Reads the word list and the allowed-type list named in *args*, keeps the
    purely alphabetic words of length ``args.word_length`` whose frequency is
    at least ``args.frequency_min`` and whose type is allowed, and prints them
    to stdout as ``wordlist = <JSON array>``. The parsed arguments, the number
    of words kept and every type seen in the input are reported on stderr.

    The output is ordered by ascending frequency (ties keep file order) and
    each word appears once, at the position of its lowest-frequency entry.

    Returns:
        0, used as the process exit status.
    """
    allowed_types = _read_allowed_types(args.allowedtypelist)
    entries = _read_entries(args.wordlist)

    # Every type present in the input, reported to help build the allow-list.
    seen_types = {word_type for _, _, word_type in entries}

    selected = [
        (word, frequency)
        for word, frequency, word_type in entries
        if frequency >= args.frequency_min
        and word.isalpha()
        and len(word) == args.word_length
        and word_type in allowed_types
    ]

    # Sort while the frequency is still available; list.sort is stable.
    selected.sort(key=lambda item: item[1])

    # remove duplicates, keeping first occurrence so the order is deterministic
    words = list(dict.fromkeys(word for word, _ in selected))

    print(f"wordlist = {json.dumps(words)}")

    print(f"{args}", file=sys.stderr)
    print(f"{len(words)=}", file=sys.stderr)
    print(f"types={seen_types!r}", file=sys.stderr)

    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == '__main__':
    try:
        sys.exit(main(get_args()))
    except KeyboardInterrupt:
        # Ctrl-C is treated as a normal, quiet exit (no traceback, status 0).
        sys.exit(0)