#!/usr/bin/env python3
"""
Expand dot-prefixed macros (e.g. `.hc`) found in plain text.

Macros come from a tab-separated macros file and from "on the fly" (otf) macros that are
defined inline in the text being processed.
"""

import sys
import re


# Marker that terminates an on the fly macro definition: either ".," or ",."
_OTF_MACRO_END = re.compile(r'\.,|,\.')


class MultipleTokens():
    """
    Token wrapper recognised by detokenize(): one token slot that expands to several words,
    so extra words can be emitted without changing the indexes of other tokens.
    """

    def __init__(self, words):
        self.words = words


def get_args():
    """ Get command line arguments """

    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-m", "--macros-file", default="macros",
                        help="File where macros are stored")
    parser.add_argument("-i", "--input", help="File to be processed.", default="-")
    parser.add_argument("-o", "--output", help="Path of output", default="-")
    return parser.parse_args()


def pluralize(input):
    """
    Returns the plural form of a macro value.

    `input` is either a plain word (str) or a macro definition list of the form
    `[singular, custom_plural, ...]`. A custom plural, when present, is returned as is.
    """
    if isinstance(input, list):
        # use custom plural if defined
        if len(input) > 1:
            return input[1]

        return pluralize_word(input[0])

    return pluralize_word(input)


def pluralize_word(word):
    """
    Returns the plural of `word` using a handful of English spelling rules.

    An empty string is returned unchanged; one-letter words no longer raise IndexError.
    """
    def is_vowel(letter):
        if not isinstance(letter, str):
            raise ValueError("Argument 'letter' must be type str")
        if len(letter) != 1:
            raise ValueError("Argument 'letter' must be 1 long")
        return letter in 'aeiou'

    if not word:
        return word

    last = word[-1]
    # the -y and -o rules only apply when a consonant comes before the final letter
    after_consonant = len(word) > 1 and not is_vowel(word[-2])

    # TODO add more complex plural forms
    if last in 'sxz' or word[-2:] in ['ch', 'sh']:
        return word + 'es'
    if last == 'y' and after_consonant:
        return word[:-1] + 'ies'
    if last == 'o' and after_consonant:
        return word + 'es'
    if last == 'f':
        return word[:-1] + 'ves'
    return word + 's'


def upper_check(token, word):
    """
    Capitalise `word` the way the macro keyword `token` was written.

    An all-caps keyword (`.HC`) upper-cases the whole word; a keyword whose second character
    (the first one after the leading dot) is upper case (`.Hc`) capitalises the first letter.
    A keyword without any cased letter leaves `word` untouched.
    """
    has_upper = any(letter.isupper() for letter in token)
    has_lower = any(letter.islower() for letter in token)

    if has_upper and not has_lower:
        return word.upper()

    if len(token) > 1 and token[1].isupper():
        return word[:1].upper() + word[1:]

    return word


def _singular(definition):
    """ Returns the singular form stored in a macro definition (list or plain str). """
    if isinstance(definition, str):
        return definition
    return definition[0]


def process(input, macros):
    """
    This function takes the string `input` and a dict, `macros`.
    It substitutes any keys in `macros` with the corresponding value.
    It also checks for any otf macros defined in the string and adds them to `macros`
    (the dict passed in is modified), replacing that otf macro and any following
    instances of it.
    It returns the substituted string.
    """
    tokens = tokenize(input)

    in_otf_macro = False
    tmp_macro_keyword = None
    tmp_macro_definition = []

    for line_number, line in enumerate(tokens):
        for token_number, token in enumerate(line):
            if len(token) == 0:
                continue

            # detect on the fly macros; only tokens *after* this one can close the
            # definition, so an end marker earlier in the line must not count
            if is_otf_macro_start(token, line[token_number + 1:]):
                tmp_macro_keyword = token
                in_otf_macro = True
                tmp_macro_definition = []
                tokens[line_number][token_number] = None
                continue
            elif in_otf_macro and is_otf_macro_end(token):
                last_word, suffix = _OTF_MACRO_END.split(token, maxsplit=1)
                if last_word:
                    tmp_macro_definition.append(last_word)
                # stored like macros read from file: lower-cased key, list value
                macros[tmp_macro_keyword.lower()] = [' '.join(tmp_macro_definition)]
                token = tmp_macro_keyword + suffix
                in_otf_macro = False
                # once the end of the macro has been found and stored, continue down the
                # loop body so that the keyword is expanded like any other macro
            elif in_otf_macro:
                tmp_macro_definition.append(token)
                tokens[line_number][token_number] = None
                continue

            # cutting off the end and then adding it back once expanded
            # e.g. punctuation: from the token "hello...", end would be equal to "..."
            # and token would be equal to "hello"
            cut = len(token)
            while cut > 0 and not token[cut - 1].isalnum():
                cut -= 1
            token, end = token[:cut], token[cut:]

            key = token.lower()
            if key in macros:
                value = upper_check(token, _singular(macros[key]))
            elif key.endswith('s') and key[:-1] in macros:
                value = upper_check(token, pluralize(macros[key[:-1]]))
            else:
                # not a macro: leave the text exactly as written
                value = token

            tokens[line_number][token_number] = value + end

        # filter out None tokens
        tokens[line_number] = [token for token in tokens[line_number] if token is not None]

    return detokenize(tokens)


def tokenize(input):
    """
    Returns a 2D list of tokens: one list per line of `input`, each line split on single
    spaces (so consecutive spaces produce empty tokens).
    """
    return [x.split(' ') for x in input.split('\n')]


def detokenize(tokens):
    """
    Turn a 2D list of tokens into plaintext.

    Raises ValueError for a token that is neither a str nor a MultipleTokens.
    """
    output = []

    for line in tokens:
        words = []
        for token in line:
            if isinstance(token, MultipleTokens):
                words.extend(token.words)
            elif isinstance(token, str):
                words.append(token)
            else:
                raise ValueError(f"Unknown token type: {type(token)}")
        output.append(' '.join(words))

    return '\n'.join(output)


def get_macros(input, child=False):
    """
    Turn a macros string into a dict mapping lower-cased keyword to a list of values.

    Each line is tab separated: keyword, singular and optionally a custom plural.
    A line whose first field is `source` pulls in the macros of the file named in its second
    field; sourced macros are added after the macros of the sourcing file.
    When `child` is true the raw list of split lines is returned instead of the dict.
    Raises ValueError if a `source` line has no file path.
    """
    rows = []
    sourced_rows = []

    for line in input.split('\n'):
        fields = line.split('\t')
        if fields[0] == "source":
            if len(fields) < 2 or not fields[1]:
                raise ValueError("`source` line is missing a file path")
            with open(fields[1]) as file:
                sourced_rows += get_macros(file.read(), child=True)
        else:
            rows.append(fields)

    rows += sourced_rows

    if child:
        return rows

    # store macros as dict and return; lines without a value are ignored
    response = {}
    for fields in rows:
        if len(fields) >= 2:
            response[fields[0].lower()] = fields[1:]
    return response


def is_otf_macro_start(token, line):
    """
    Returns true if token is the start of an on the fly macro, i.e. it looks like a macro
    keyword and one of the tokens in `line` is an otf macro end.
    """
    if re.search(r'^\.[A-Za-z0-9]+$', token) is None:
        return False

    # don't return true if you can't find an end token in the line
    return any(is_otf_macro_end(line_token) for line_token in line)


def is_otf_macro_end(token):
    """ Returns true if token is the end of an on the fly macro """
    return _OTF_MACRO_END.search(str(token)) is not None


def main(args):
    """ Entry point for script. Returns the process exit status. """

    # get macros
    with open(args.macros_file) as file:
        macros = get_macros(file.read())

    # get file contents
    if args.input == "-":
        text = sys.stdin.read()
    else:
        with open(args.input) as file:
            text = file.read()

    result = process(text, macros)

    if args.output == "-":
        # print() appends a newline after the text
        print(result)
    else:
        with open(args.output, 'w+') as file:
            file.write(result)

    # explicit status: the number of characters written must not reach sys.exit()
    return 0


if __name__ == '__main__':
    try:
        sys.exit(main(get_args()))
    except KeyboardInterrupt:
        sys.exit(0)
+ +If this sentence makes sense, then the test of .otfms worked! + +.otfms can also be overwritten, +you could make it equal .otfm on the fly monkey.,s! + +They're not just any monkeys, they're .otfms! diff --git a/tests/test_macros_biology b/tests/test_macros_biology new file mode 100644 index 0000000..3d40842 --- /dev/null +++ b/tests/test_macros_biology @@ -0,0 +1,2 @@ +.chl chlorophyll +.ps photosynthesis diff --git a/tests/test_macros_custom_plurals b/tests/test_macros_custom_plurals new file mode 100644 index 0000000..01e66e6 --- /dev/null +++ b/tests/test_macros_custom_plurals @@ -0,0 +1 @@ +.l louse lice diff --git a/tests/test_macros_plural b/tests/test_macros_plural new file mode 100644 index 0000000..80b1b34 --- /dev/null +++ b/tests/test_macros_plural @@ -0,0 +1,6 @@ +.hc hydrocarbon +.dr dress +.st story +.he hero +.le leaf +.ma man men