From 16019e8d5b18c232254d79be979e8ecfdcbecbfc Mon Sep 17 00:00:00 2001 From: Alvie Rahman Date: Sun, 30 Jan 2022 22:05:51 +0000 Subject: [PATCH] update wordlist generator --- allowed_types | 1 + scripts/gen_wordlist.py | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/allowed_types b/allowed_types index 9dabaa0..4a5eae9 100644 --- a/allowed_types +++ b/allowed_types @@ -7,3 +7,4 @@ verb pron conj noc +@ diff --git a/scripts/gen_wordlist.py b/scripts/gen_wordlist.py index 317970a..6708cc6 100755 --- a/scripts/gen_wordlist.py +++ b/scripts/gen_wordlist.py @@ -3,6 +3,18 @@ import sys import json +class WordListItem: + def __init__(self, line): + word = line.split('\t') + + self.word = word[1] if word[1].isalpha() else word[3] + self.pos = word[2] + self.frequency = int(word[4]) + + + def __repr__(self): + return f"" + def get_args(): """ Get command line arguments """ @@ -24,19 +36,22 @@ def main(args): types = set() with open(args.wordlist) as fp: - words = [ (word[1], int(word[4]), word[2]) for word in [ word.lower().split('\t') for word in fp.read().strip().split('\n') ] ] + words = [ WordListItem(line) for line in fp.read().strip().lower().split('\n') ] - [ types.add(word[2]) for word in words ] + [ types.add(word.pos) for word in words ] - words = [ word[0] for word in words if word[1] >= args.frequency_min and word[0].isalpha() and len(word[0]) == args.word_length and word[2] in allowed_types ] + words = [ word.word for word in words if word.frequency >= args.frequency_min and word.word.isalpha() and len(word.word) == args.word_length and word.pos in allowed_types ] - words.sort(key=lambda word: word[1]) + words.sort() # remove duplicates words = list(set(words)) print(f"wordlist = {json.dumps(words)}") - print(f"{args}", file=sys.stderr) + print(f"{args=}", file=sys.stderr) print(f"{len(words)=}", file=sys.stderr) print(f"{types=}", file=sys.stderr) + print(f"{'cares' in words=}", file=sys.stderr) + print(f"{'ideas' in words=}", file=sys.stderr) + print(f"{'prose' in words=}", file=sys.stderr) return 0