words/scripts/gen_wordlist.py

64 lines
1.7 KiB
Python
Raw Normal View History

2022-01-27 18:36:45 +00:00
#!/usr/bin/env python3
import sys
import json
2022-01-30 22:05:51 +00:00
class WordListItem:
    """One row of the tab-separated word list, parsed into attributes.

    Attributes set by the constructor:
        word: field 1 of the row when that field is purely alphabetic,
            otherwise field 3.
        pos: field 2 of the row, stored verbatim.
        frequency: field 4 of the row converted with ``int``.
    """

    def __init__(self, line):
        """Split *line* on tab characters and pick out the fields used.

        Raises:
            IndexError: if the row has fewer tab-separated fields than
                the indices read here.
            ValueError: if field 4 is not an integer literal.
        """
        columns = line.split('\t')
        candidate = columns[1]
        if candidate.isalpha():
            self.word = candidate
        else:
            # Fall back to the alternative spelling column.
            self.word = columns[3]
        self.pos = columns[2]
        self.frequency = int(columns[4])

    def __repr__(self):
        """Return a debugging representation listing all three attributes."""
        return f"<WordListItem {self.word=} {self.pos=} {self.frequency=}>"
2022-01-27 18:36:45 +00:00
def get_args():
    """Parse the four positional command line arguments.

    Returns:
        The ``argparse`` namespace with ``wordlist``, ``word_length`` (int),
        ``frequency_min`` (int) and ``allowedtypelist`` attributes.
    """
    from argparse import ArgumentParser

    # (name, extra keyword arguments) in the order they appear on the
    # command line.
    positionals = (
        ('wordlist', {}),
        ('word_length', {'type': int}),
        ('frequency_min', {'type': int}),
        ('allowedtypelist', {}),
    )

    cli = ArgumentParser()
    for name, options in positionals:
        cli.add_argument(name, **options)
    return cli.parse_args()
def main(args):
    """Filter the word list and print it as ``wordlist = <JSON array>``.

    The word list file is read, lower-cased and parsed into
    ``WordListItem`` rows. A word is kept when its frequency is at least
    ``args.frequency_min``, it is purely alphabetic, its length equals
    ``args.word_length`` and its part-of-speech tag appears in the
    allowed-type file. The surviving words are de-duplicated and sorted,
    then written to stdout; diagnostics go to stderr.

    Args:
        args: namespace with ``wordlist`` and ``allowedtypelist`` (file
            paths) and ``word_length`` and ``frequency_min`` (ints).

    Returns:
        0, used by the caller as the process exit status.
    """
    with open(args.allowedtypelist) as fp:
        # A set gives constant-time membership tests in the filter below;
        # the members are exactly the newline-separated entries of the file.
        # NOTE(review): the word list is lower-cased before parsing but this
        # file is not -- confirm its entries are already lower case.
        allowed_types = set(fp.read().split('\n'))

    with open(args.wordlist) as fp:
        items = [
            WordListItem(line)
            for line in fp.read().strip().lower().split('\n')
        ]

    # Every part-of-speech tag seen in the input, reported on stderr below.
    types = {item.pos for item in items}

    # De-duplicate first and sort afterwards: sorting before converting to a
    # set would throw the ordering away and make the output order vary
    # between runs.
    words = sorted({
        item.word
        for item in items
        if item.frequency >= args.frequency_min
        and item.word.isalpha()
        and len(item.word) == args.word_length
        and item.pos in allowed_types
    })

    print(f"wordlist = {json.dumps(words)}")

    # Diagnostics; the local names are part of the output because of the
    # f-string ``=`` specifier.
    print(f"{args=}", file=sys.stderr)
    print(f"{len(words)=}", file=sys.stderr)
    print(f"{types=}", file=sys.stderr)
    print(f"{'cares' in words=}", file=sys.stderr)
    print(f"{'ideas' in words=}", file=sys.stderr)
    print(f"{'prose' in words=}", file=sys.stderr)

    return 0
if __name__ == '__main__':
    # Run the script; an interrupt from the keyboard counts as a clean exit.
    try:
        status = main(get_args())
    except KeyboardInterrupt:
        status = 0
    sys.exit(status)