first commit

This commit is contained in:
Akbar Rahman 2021-03-07 14:40:58 +00:00
commit 5f21b2c1dc
11 changed files with 341 additions and 0 deletions

9
Makefile Normal file
View File

@ -0,0 +1,9 @@
include config.mk
install:
mkdir -p ${DESTDIR}${PREFIX}/bin/
cp -f otfm-python ${DESTDIR}${PREFIX}/bin/
chmod 755 ${DESTDIR}${PREFIX}/bin/
uninstall:
rm -f ${DESTDIR}${PREFIX}/bin/otfm-python

1
config.mk Normal file
View File

@ -0,0 +1 @@
PREFIX=/usr/local

8
macros Normal file
View File

@ -0,0 +1,8 @@
.hc hydrocarbon
.hy hydrogen
.ca carbon
.ox oxygen
.wink 😉
source tests/test_macros_biology
source tests/test_macros_custom_plurals
source tests/test_macros_plural

254
otfm-python Executable file
View File

@ -0,0 +1,254 @@
#!/usr/bin/env python3
import sys
import re
class MultipleTokens():
"""
Used by process() to tell detokenize() that a macro adds extra tokens without modifying
without changing the indexes of other tokens
"""
def __init__(self, words):
self.words = words
def get_args():
""" Get command line arguments """
import argparse
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-m", "--macros-file", default="macros",
help="File where macros are stored")
parser.add_argument("-i", "--input", help="File to be processed.", default="-")
parser.add_argument("-o", "--output", help="Path of output", default="-")
return parser.parse_args()
def pluralize(input):
""" Returns the plural form of a word. """
if isinstance(input, list):
# use custom plural if defined
if len(input) > 1:
return input[1]
return pluralize_word(input[0])
return pluralize_word(input)
def pluralize_word(word):
def is_vowel(letter):
if not isinstance(letter, str):
raise ValueError("Argument 'letter' must be type str")
if len(letter) != 1:
raise ValueError("Argument 'letter' must be 1 long")
return letter in 'aeiou'
# TODO add more complex plural forms
if word[-1] in 'sxz' or word[-2:] in ['ch', 'sh']:
return word + 'es'
if word[-1] == 'y':
if not is_vowel(word[-2]):
return word[:-1] + 'ies'
if word[-1] == 'o':
if not is_vowel(word[-2]):
return word + 'es'
if word[-1] == 'f':
return word[:-1] + 'ves'
return word + 's'
def upper_check(token, word):
""" Check if word needs to be capitalized and capitalise appropriately if that is the case. """
all_caps = True
for letter in token:
if letter.islower():
all_caps = False
break
if all_caps:
return word.upper()
if len(token) > 1:
if token[1].isupper():
return word[:1].upper() + word[1:]
return word
def process(input, macros):
"""
This function takes the string `input` and a dict, ` macros`.
It substitutes any keys in `macro` with the corresponding value.
It also checks for any otf macros defined in the string and appends them to `macros`,
replacing that otf macro and any following instances of it.
It returns the substituted string.
"""
tokens = tokenize(input)
macros = macros
in_otf_macro = False
tmp_macro_keyword = None
tmp_macro_definition = []
for line_number, line in enumerate(tokens):
for token_number, token in enumerate(line):
if len(token) == 0:
continue
# detect on the fly macros
token_is_otf_macro_start = is_otf_macro_start(token, line)
# process otf macro tokens
if token_is_otf_macro_start:
tmp_macro_keyword = token
in_otf_macro = True
tmp_macro_definition = []
tokens[line_number][token_number] = None
continue
elif in_otf_macro and is_otf_macro_end(token):
split_token = re.split(r',.|.,', token)
tmp_macro_definition.append(split_token[0])
macros[tmp_macro_keyword] = ' '.join(tmp_macro_definition)
token = tmp_macro_keyword + split_token[1]
in_otf_macro = False
# once the end of the macro has been found and stored, continue downn the for loop
# so that it can be turned back to normal text
elif in_otf_macro:
tmp_macro_definition.append(token)
tokens[line_number][token_number] = None
continue
# cutting off the end and then adding it back once expanded
# e.g. punctuation: from the token "hello...", end would be equal to "..."
# and token would be equal to "hello"
end = []
token = list(token)
for index, char in reversed(list(enumerate(token))):
if not char.isalnum():
end.insert(0, token.pop(index))
else:
break
end = ''.join(end)
token = ''.join(token)
# if no macro is found (or if it is not a macro at all, the value
# will not be changed
value = token
if token.lower() in macros.keys():
value = macros[token.lower()][0]
elif token.lower() in [f"{m}s" for m in macros.keys()]:
value = pluralize(macros[token.lower()[:-1]])
tokens[line_number][token_number] = upper_check(token, value)
tokens[line_number][token_number] += end
# filter out None tokens
tokens[line_number] = [token for token in tokens[line_number] if token is not None]
return detokenize(tokens)
def tokenize(input):
"""
Returns a 2D list of tokens and a list of otf_macros.
otf macro definitions are removed and just the keyword definition is kept as well as any
punctuation on the final word.
"""
return [x.split(' ') for x in input.split('\n')]
def detokenize(tokens):
"""Turn a list of tokens into plaintext. """
output = []
for index, line in enumerate(tokens):
output.append([])
for token in line:
if isinstance(token, MultipleTokens):
for word in token.words:
output[index].append(word)
elif isinstance(token, str):
output[index].append(token)
else:
raise ValueError(f"Unknown token type: {type(token)}")
for line_number, line in enumerate(output):
output[line_number] = ' '.join(line)
return '\n'.join(output)
def get_macros(input, child=False):
""" Turn a macros string into a list of tuples of macros """
response = {}
# turn input into list of tuples
macros = [re.split('[\t]', x) for x in input.split('\n')]
# check if keyword is `source`, get macros from sourced file if it is
for index, macro in enumerate(macros):
if macro[0] == "source":
with open(macro[1]) as file:
macros += get_macros(file.read(), child=True)
macros[index] = ()
if child:
return macros
# store macros as dict and return
for index, macro in enumerate(macros):
if len(macro) >= 2:
response[macro[0].lower()] = macro[1:]
return response
def is_otf_macro_start(token, line):
""" Returns true if token is the start of an on the fly macro """
match = re.search(r'^\.[A-Za-z0-9]+$', token)
if match is None:
return False
# don't return true you can't find an end token in the line
for line_token in line:
if is_otf_macro_end(line_token):
return match is not None
return False
def is_otf_macro_end(token):
""" Returns true if token is the end of an on the fly macro """
match = re.search(r'(\.,|,\.)', f"{token}")
return match is not None
def main(args):
""" Entry point for script """
# get macros
with open(args.macros_file) as file:
macros = get_macros(file.read())
# get tokens (file contents)
if args.input == "-":
input = sys.stdin.read()
else:
with open(args.input) as file:
input = file.read()
if args.output == "-":
return print(process(input, macros))
else:
with open(args.output, 'w+') as file:
return file.write(process(input, macros))
if __name__ == '__main__':
try:
sys.exit(main(get_args()))
except KeyboardInterrupt:
sys.exit(0)

24
readme.md Normal file
View File

@ -0,0 +1,24 @@
# pymacro
A python implementation of [otfmacros](https://github.com/alvierahman90/otfmacros).
## usage
```
$ ./pymacro -h
usage: pymacro [-h] [-m MACROS_FILE] [-i INPUT] [-o OUTPUT]
optional arguments:
-h, --help show this help message and exit
-m MACROS_FILE, --macros-file MACROS_FILE
File where macros are stored (default: macros)
-i INPUT, --input INPUT
File to be processed. (default: -)
-o OUTPUT, --output OUTPUT
Path of output (default: -)
```
## testing
Run `test.sh`.
A `diff` is run on the actual output against what should have come out according
to the spec.

3
test.sh Executable file
View File

@ -0,0 +1,3 @@
#/usr/bin/env sh
cat tests/test_input | ./pymacro > tests/test_actual_output
diff tests/test_actual_output tests/test_expected_output

View File

@ -0,0 +1,12 @@
Hydrocarbons are composed of exclusively hydrogen and carbon.
Chlorophyll is the site of photosynthesis.
😉
1 hydrocarbon 2 hydrocarbons
1 dress 2 dresses
1 story 2 stories
1 hero 2 heroes
1 leaf 2 leaves
1 man 2 men

21
tests/test_input Normal file
View File

@ -0,0 +1,21 @@
.Hcs are composed of exclusively .hy and .ca.
.Chl is the site of .ps.
.wink
1 .hc 2 .hcs
1 .dr 2 .drs
1 .st 2 .sts
1 .he 2 .hes
1 .le 2 .les
1 .ma 2 .mas
This is a test of .otfm on the fly macro.,s!
If this sentence makes sense, then the test of .otfms worked!
.otfms can also be overwritten,
you could make it equal .otfm on the fly monkey.,s!
They're not just any monkeys, they're .otfms!

View File

@ -0,0 +1,2 @@
.chl chlorophyll
.ps photosynthesis

View File

@ -0,0 +1 @@
.l louse lice

6
tests/test_macros_plural Normal file
View File

@ -0,0 +1,6 @@
.hc hydrocarbon
.dr dress
.st story
.he hero
.le leaf
.ma man men