Remove names and other weird words from the wordlist by filtering on an allowed word-type list
This commit is contained in:
@@ -12,15 +12,23 @@ def get_args():
|
||||
parser.add_argument('wordlist')
|
||||
parser.add_argument('word_length', type=int)
|
||||
parser.add_argument('frequency_min', type=int)
|
||||
parser.add_argument('allowedtypelist')
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
""" Entry point for script """
|
||||
with open(args.wordlist) as fp:
|
||||
words = [ (word[1], int(word[4])) for word in [ word.lower().split('\t') for word in fp.read().strip().split('\n') ] ]
|
||||
|
||||
words = [ word[0] for word in words if word[1] > args.frequency_min and word[0].isalpha() and len(word[0]) == args.word_length ]
|
||||
with open(args.allowedtypelist) as fp:
|
||||
allowed_types = fp.read().split('\n')
|
||||
|
||||
types = set()
|
||||
with open(args.wordlist) as fp:
|
||||
words = [ (word[1], int(word[4]), word[2]) for word in [ word.lower().split('\t') for word in fp.read().strip().split('\n') ] ]
|
||||
|
||||
[ types.add(word[2]) for word in words ]
|
||||
|
||||
words = [ word[0] for word in words if word[1] >= args.frequency_min and word[0].isalpha() and len(word[0]) == args.word_length and word[2] in allowed_types ]
|
||||
|
||||
words.sort(key=lambda word: word[1])
|
||||
# remove duplicates
|
||||
@@ -28,6 +36,7 @@ def main(args):
|
||||
print(f"wordlist = {json.dumps(words)}")
|
||||
print(f"{args}", file=sys.stderr)
|
||||
print(f"{len(words)=}", file=sys.stderr)
|
||||
print(f"{types=}", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
|
Reference in New Issue
Block a user