update wordlist generator
This commit is contained in:
parent
30965c20ba
commit
16019e8d5b
@ -7,3 +7,4 @@ verb
|
|||||||
pron
|
pron
|
||||||
conj
|
conj
|
||||||
noc
|
noc
|
||||||
|
@
|
||||||
|
@ -3,6 +3,18 @@
|
|||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
class WordListItem:
    """One row of the tab-separated wordlist file.

    The row is split on tab characters and the following fields are read
    (field 0 and anything after field 4 are ignored):

    * field 1 -- used as the word when it is purely alphabetic
    * field 2 -- stored as the part-of-speech tag
    * field 3 -- used as the word when field 1 is not purely alphabetic
      (presumably the inflected form on rows whose field 1 is a
      placeholder such as ``@`` -- TODO confirm against the data file)
    * field 4 -- parsed as the integer frequency
    """

    # One instance is created per wordlist line, so skip the per-instance
    # __dict__ and fix the attribute set.
    __slots__ = ("word", "pos", "frequency")

    def __init__(self, line: str) -> None:
        """Parse a single tab-separated line.

        Raises:
            IndexError: if the line has fewer than five tab-separated fields.
            ValueError: if field 4 is not an integer.
        """
        fields = line.split('\t')

        primary = fields[1]
        # Fall back to field 3 when field 1 is empty or contains any
        # non-alphabetic character (``str.isalpha`` is False for both).
        self.word: str = primary if primary.isalpha() else fields[3]
        self.pos: str = fields[2]
        # int() tolerates surrounding whitespace, e.g. a trailing "\r".
        self.frequency: int = int(fields[4])

    def __repr__(self) -> str:
        """Return a debugging representation showing all three attributes."""
        return f"<WordListItem {self.word=} {self.pos=} {self.frequency=}>"
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
""" Get command line arguments """
|
""" Get command line arguments """
|
||||||
@ -24,19 +36,22 @@ def main(args):
|
|||||||
|
|
||||||
types = set()
|
types = set()
|
||||||
with open(args.wordlist) as fp:
|
with open(args.wordlist) as fp:
|
||||||
words = [ (word[1], int(word[4]), word[2]) for word in [ word.lower().split('\t') for word in fp.read().strip().split('\n') ] ]
|
words = [ WordListItem(line) for line in fp.read().strip().lower().split('\n') ]
|
||||||
|
|
||||||
[ types.add(word[2]) for word in words ]
|
[ types.add(word.pos) for word in words ]
|
||||||
|
|
||||||
words = [ word[0] for word in words if word[1] >= args.frequency_min and word[0].isalpha() and len(word[0]) == args.word_length and word[2] in allowed_types ]
|
words = [ word.word for word in words if word.frequency >= args.frequency_min and word.word.isalpha() and len(word.word) == args.word_length and word.pos in allowed_types ]
|
||||||
|
|
||||||
words.sort(key=lambda word: word[1])
|
words.sort()
|
||||||
# remove duplicates
|
# remove duplicates
|
||||||
words = list(set(words))
|
words = list(set(words))
|
||||||
print(f"wordlist = {json.dumps(words)}")
|
print(f"wordlist = {json.dumps(words)}")
|
||||||
print(f"{args}", file=sys.stderr)
|
print(f"{args=}", file=sys.stderr)
|
||||||
print(f"{len(words)=}", file=sys.stderr)
|
print(f"{len(words)=}", file=sys.stderr)
|
||||||
print(f"{types=}", file=sys.stderr)
|
print(f"{types=}", file=sys.stderr)
|
||||||
|
print(f"{'cares' in words=}", file=sys.stderr)
|
||||||
|
print(f"{'ideas' in words=}", file=sys.stderr)
|
||||||
|
print(f"{'prose' in words=}", file=sys.stderr)
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user