This repository has been archived on 2024-01-02. You can view files and clone it, but cannot push or open issues or pull requests.
notes2web/gronk.py

553 lines
17 KiB
Python
Raw Permalink Normal View History

2021-06-29 13:35:21 +00:00
#!/usr/bin/env python3
2024-01-02 18:22:15 +00:00
"""
2023-09-17 19:47:05 +00:00
gronk --- view your notes as a static html site
2024-01-02 18:22:15 +00:00
"""
2021-06-29 13:35:21 +00:00
2024-01-02 18:22:15 +00:00
import argparse
2021-06-29 13:35:21 +00:00
import os
2024-01-02 18:22:15 +00:00
from pathlib import Path
import shutil
import sys
2024-01-02 03:49:49 +00:00
import subprocess
import copy
import time
import magic
import regex as re
2024-01-02 18:22:15 +00:00
import pprint
import frontmatter
import jinja2
import requests
2023-09-17 19:47:05 +00:00
# Version identifier handed to the templates as `gronk_commit`.
GRONK_COMMIT = "dev"

# Runtime configuration; every value can be overridden through the environment.
PANDOC_SERVER_URL = os.getenv("PANDOC_SERVER_URL", r"http://localhost:3030/")
PANDOC_TIMEOUT = int(os.getenv("PANDOC_TIMEOUT", "120"))
GRONK_CSS_DIR = Path(os.getenv("GRONK_CSS_DIR", "/opt/gronk/css"))
GRONK_JS_DIR = Path(os.getenv("GRONK_JS_DIR", "/opt/gronk/js"))
GRONK_TEMPLATES_DIR = Path(
    os.getenv("GRONK_TEMPLATES_DIR", "/opt/gronk/templates/"))

# select_autoescape is a factory: it has to be *called* to obtain the
# per-template predicate. Passing the factory itself only enabled escaping by
# accident (its return value, a function object, is truthy).
JINJA_ENV = jinja2.Environment(
    loader=jinja2.FileSystemLoader(searchpath=GRONK_TEMPLATES_DIR),
    autoescape=jinja2.select_autoescape())

# Templates are loaded once at import time and shared by all render functions.
JINJA_TEMPLATE_TEXTARTICLE = JINJA_ENV.get_template("article-text.html")
JINJA_TEMPLATE_HOME_INDEX = JINJA_ENV.get_template("home.html")
JINJA_TEMPLATE_INDEX = JINJA_ENV.get_template("index.html")
JINJA_TEMPLATE_ARTICLE = JINJA_ENV.get_template("article.html")
JINJA_TEMPLATE_PERMALINK = JINJA_ENV.get_template("permalink.html")

# Both are populated by main() before any rendering happens.
LICENSE = None
FILEMAP = None
2021-09-01 13:08:24 +00:00
2024-01-02 03:49:49 +00:00
class FileMap:
    """
    this class is used to read file properties, inherit properties,
    and have a centralised place to access them

    Properties are plain dicts stored under the string form of the source
    path. Besides whatever the frontmatter (or a directory's readme.md)
    defines, every stored entry carries 'src_path', 'dst_path' and 'is_dir'.
    """

    def __init__(self, input_dir, output_dir):
        self._map = {}
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)

    @staticmethod
    def _path_to_key(path):
        """ return the dictionary key used for a path """
        return str(path)

    @staticmethod
    def is_plaintext(filename):
        """ True when libmagic reports a `text/*` mime type for the file """
        mime_type = magic.from_file(str(filename), mime=True)
        return re.match(r'^text/', mime_type) is not None

    def _is_within_input_dir(self, path):
        """ True when path is the input directory or lies somewhere below it """
        return path == self.input_dir or self.input_dir in path.parents

    def add(self, filepath):
        """ read the properties of filepath from disk and store them """
        filepath = Path(filepath)
        if filepath.is_dir():
            properties = self._get_directory_properties(filepath)
        else:
            properties = self._get_file_properties(filepath)

        properties['src_path'] = filepath
        properties['dst_path'] = self._get_output_filepath(filepath)

        self._map[self._path_to_key(filepath)] = properties

    def get(self, filepath, default=None, raw=False):
        """
        get the properties of a file at a filepath
        raw=True to not inherit properties

        A deep copy is returned, so callers may modify the result freely.
        Unless raw is set, the tags of every ancestor directory up to and
        including the input directory are appended to the entry's own tags.
        """
        # TODO maybe store properties of a file once it's in built and mark it
        # as built? might save time but also cba
        filepath = Path(filepath)
        key = self._path_to_key(filepath)
        if key not in self._map:
            self.add(filepath)

        properties = copy.deepcopy(self._map.get(key, default))

        if raw:
            return properties

        # Never climb above the input directory: directories outside of it
        # have no output path (relative_to() raises ValueError for them), which
        # previously broke any run whose notes path was absolute or nested.
        for parent in filepath.parents:
            if not self._is_within_input_dir(parent):
                break

            parent_properties = self.get(parent, raw=True)
            # TODO inherit any property that isn't defined, append any lists
            # that exist
            properties['tags'] = properties.get(
                'tags', []) + parent_properties.get('tags', [])

        return properties

    def _get_directory_properties(self,
                                  filepath: Path,
                                  include_index_entries=True):
        """
        build the properties of a directory

        Starts from defaults and overlays the frontmatter of the directory's
        readme.md when there is one; its markdown body is rendered to html.
        include_index_entries=False skips listing the directory's children.
        """
        post = {
            'title': filepath.name,
            'content_after_search': False,
            'automatic_index': True,
            'search_bar': True,
            'tags': [],
        }

        if any(child.name == 'readme.md' for child in filepath.iterdir()):
            with open(filepath.joinpath('readme.md'),
                      encoding='utf-8') as file_pointer:
                post.update(frontmatter.load(file_pointer).to_dict())

        if 'content' in post:
            post['content'] = render_markdown(post['content'])

        post['is_dir'] = True

        if include_index_entries:
            post['index_entries'] = self._get_index_entries(filepath)

        return post

    def _get_index_entries(self, filepath):
        """
        list the direct children of a directory as index entries,
        directories first and alphabetical by title within each group
        """
        entries = []

        for path in filepath.iterdir():
            if '.git' in path.parts:
                continue

            if path.is_dir():
                entry = self._get_directory_properties(
                    path, include_index_entries=False)
            else:
                entry = self._get_file_properties(path)

            entry['path'] = self._get_output_filepath(path)['web']
            entries.append(entry)

        # two stable sorts: by title first, then directories to the front
        entries.sort(key=lambda entry: str(entry.get('title', '')).lower())
        entries.sort(key=lambda entry: entry['is_dir'], reverse=True)

        return entries

    def _get_file_properties(self, filepath):
        """
        build the properties of a file

        Markdown files contribute their frontmatter. The file name stays as
        the title when the frontmatter does not define one.
        """
        post = {'title': filepath.name}

        if filepath.suffix == '.md':
            with open(filepath, encoding='utf-8') as file_pointer:
                # update instead of replace so the default title survives
                post.update(frontmatter.load(file_pointer).to_dict())

        # don't store file contents in memory
        post.pop('content', None)
        post['is_dir'] = False

        return post

    def _get_output_filepath(self, input_filepath):
        """
        work out where an input path ends up in the output tree

        Returns a dict with 'raw' (path below output_dir), 'web' (the same
        location below /notes) and, for markdown and plaintext files, 'html'
        (the rendered page, which 'web' then points at).
        """

        def webpath(filepath):
            return Path('/notes').joinpath(
                filepath.relative_to(self.output_dir))

        mirrored = self.output_dir.joinpath(
            input_filepath.relative_to(self.input_dir))

        r = {'raw': mirrored, 'web': webpath(mirrored)}

        if input_filepath.is_dir():
            return r

        if input_filepath.suffix == '.md':
            r['html'] = mirrored.with_suffix('.html')
            r['web'] = webpath(r['html'])
        elif self.is_plaintext(input_filepath):
            # keep the original suffix so the preview cannot collide with
            # another file of the same stem
            r['html'] = mirrored.with_suffix(input_filepath.suffix + '.html')
            r['web'] = webpath(r['html'])

        return r

    def to_list(self):
        """ return the stored property dicts (not copies) as a list """
        return list(self._map.values())

    def to_search_data(self):
        """
        returns list of every file in map
        """
        return [{
            'title': val.get('title', ''),
            'tags': val.get('tags', []),
            'path': str(val['dst_path']['web']),
            'is_dir': val['is_dir']
        } for val in self._map.values()]

    def get_uuid_map(self):
        """ map the uuid of every entry that has one to its web path """
        return {
            val['uuid']: str(val['dst_path']['web'])
            for val in self._map.values() if 'uuid' in val
        }
2024-01-02 18:22:15 +00:00
def update_required(src_filepath, output_filepath):
    """
    check if file requires an update,
    return boolean

    An update is required when the output file does not exist yet or when the
    source file was modified more recently than the output file.
    """
    if not output_filepath.exists():
        return True

    return src_filepath.stat().st_mtime > output_filepath.stat().st_mtime
2021-08-21 01:14:12 +00:00
2024-01-02 18:22:15 +00:00
def get_args():
    """ Parse the command line and return the resulting namespace """
    force_help = ("Generate new output html even if source file was "
                  "modified before output html")

    arg_parser = argparse.ArgumentParser()
    # positional: directory holding the notes
    arg_parser.add_argument('notes', type=Path)
    arg_parser.add_argument('-o', '--output-dir', default='web', type=Path)
    arg_parser.add_argument('-F',
                            '--force',
                            help=force_help,
                            action="store_true")

    namespace = arg_parser.parse_args()
    return namespace
2024-01-02 18:22:15 +00:00
def render_markdown_file(input_filepath):
    """
    render markdown file to file
    write markdown file to args.output_dir in html

    The markdown body (frontmatter stripped) is converted with
    render_markdown() and wrapped in the article template together with the
    properties FILEMAP holds for the file. The page is written to the file's
    'html' destination path; nothing is returned.
    """
    with open(input_filepath, encoding='utf-8') as file_pointer:
        content = frontmatter.load(file_pointer).content

    properties = FILEMAP.get(input_filepath)

    html = JINJA_TEMPLATE_ARTICLE.render(
        license=LICENSE,
        content=render_markdown(content),
        lecture_slides=properties.get("lecture_slides"),
        lecture_notes=properties.get("lecture_notes"),
        uuid=properties.get("uuid"),
        tags=properties.get("tags"),
        author=properties.get("author"),
        title=properties.get("title"))

    # the source is read as UTF-8, so write UTF-8 too instead of relying on
    # whatever the locale's default encoding happens to be
    properties['dst_path']['html'].write_text(html, encoding='utf-8')
2021-08-21 01:14:12 +00:00
2024-01-02 18:22:15 +00:00
def render_plaintext_file(input_filepath):
    """
    render plaintext file to file
    copy plaintext file into a html preview, copy raw to output dir

    The preview page is rendered from the text-article template using the
    properties FILEMAP holds for the file. Nothing is returned.
    """
    properties = FILEMAP.get(input_filepath)
    html = JINJA_TEMPLATE_TEXTARTICLE.render(license=LICENSE, **properties)

    # copy the raw file byte for byte: decoding and re-encoding it with the
    # locale's default encoding fails on (or silently alters) files that are
    # text but not in that encoding
    shutil.copyfile(input_filepath, properties['dst_path']['raw'])
    properties['dst_path']['html'].write_text(html, encoding='utf-8')
2021-06-29 13:35:21 +00:00
2024-01-02 18:22:15 +00:00
def render_generic_file(input_filepath):
    """
    render generic file to file
    the file is copied unchanged to its 'raw' destination path as recorded
    in FILEMAP; nothing is returned
    """
    destination = FILEMAP.get(input_filepath)['dst_path']['raw']
    shutil.copyfile(input_filepath, destination)
2021-06-29 13:35:21 +00:00
2024-01-02 18:22:15 +00:00
def render_file(input_filepath):
    """
    render any file by detecting its type and dispatching to the matching
    renderer: markdown by suffix, plaintext by mime type, anything else is
    treated as a generic file
    returns whatever the selected renderer returns
    """
    if input_filepath.suffix == '.md':
        renderer = render_markdown_file
    elif FileMap.is_plaintext(input_filepath):
        renderer = render_plaintext_file
    else:
        renderer = render_generic_file

    return renderer(input_filepath)
2021-06-29 13:35:21 +00:00
2024-01-02 18:22:15 +00:00
def render_markdown(content):
    """
    render markdown to html

    The text is posted to the pandoc server at PANDOC_SERVER_URL and the
    'output' field of its JSON reply is returned.

    Raises requests.HTTPError when the server answers with an error status
    and RuntimeError when the reply carries no 'output' (pandoc reports
    conversion failures in an 'error' field instead).
    """
    post_body = {
        'text': content,
        'toc-depth': 6,
        'highlight-style': 'pygments',
        'html-math-method': 'mathml',
        'to': 'html',
        'files': {
            'data/data/abbreviations': '',
        },
        'standalone': False,
    }

    response = requests.post(PANDOC_SERVER_URL,
                             headers={'Accept': 'application/json'},
                             json=post_body,
                             timeout=PANDOC_TIMEOUT)
    response.raise_for_status()

    body = response.json()

    # TODO look at body['messages'] and log them maybe?
    # https://github.com/jgm/pandoc/blob/main/doc/pandoc-server.md#response
    if 'output' not in body:
        # surface pandoc's own message rather than a bare KeyError
        raise RuntimeError(
            f"pandoc server failed to render markdown: {body.get('error', body)}"
        )

    return body['output']
2023-09-17 19:18:24 +00:00
def process_home_index(args, notes_git_head_sha1=None):
    """
    create home index.html in output_dir

    The page defaults to the title 'gronk' with empty content; a readme.md
    at the top of the notes directory overrides both through its frontmatter
    and markdown body. The result is written to <output_dir>/index.html.
    """
    post = {'title': 'gronk', 'content': ''}
    custom_content_file = args.notes.joinpath('readme.md')

    if custom_content_file.is_file():
        # read as UTF-8, matching how FileMap reads readme.md files
        readme_text = custom_content_file.read_text(encoding='utf-8')
        post.update(frontmatter.loads(readme_text).to_dict())

    post['content'] = render_markdown(post['content'])

    html = JINJA_TEMPLATE_HOME_INDEX.render(
        gronk_commit=GRONK_COMMIT,
        search_data=FILEMAP.to_search_data(),
        notes_git_head_sha1=notes_git_head_sha1,
        post=post)

    args.output_dir.joinpath('index.html').write_text(html, encoding='utf-8')
2024-01-02 18:22:15 +00:00
2023-09-17 19:18:24 +00:00
2024-01-02 04:02:23 +00:00
def generate_permalink_page(output_dir):
    """
    create <output_dir>/permalink/index.html, the page that redirects
    permalinks; it is rendered with the uuid -> web path map from FILEMAP
    """
    permalink_dir = output_dir.joinpath('permalink')
    permalink_dir.mkdir(exist_ok=True)

    page = JINJA_TEMPLATE_PERMALINK.render(gronk_commit=GRONK_COMMIT,
                                           data=FILEMAP.get_uuid_map())
    permalink_dir.joinpath('index.html').write_text(page)
def generate_tag_browser(output_dir):
    """
    generate a directory that lets you group by and browse by any given tag. e.g. tags, authors

    Writes <output_dir>/<tag>/index.html for every tag found in FILEMAP,
    listing the entries carrying that tag, plus <output_dir>/index.html
    listing all tags.
    """
    tags = {}

    for post in FILEMAP.to_list():
        # build a separate index entry instead of writing a 'path' key into
        # the dict owned by FILEMAP (to_list() hands out the stored dicts)
        entry = {
            'title': post.get('title', ''),
            'is_dir': post.get('is_dir', False),
            'path': str(post['dst_path']['web']),
        }

        # `or []` also covers frontmatter that declares `tags:` with no value
        for tag in post.get('tags') or []:
            tags.setdefault(tag, []).append(entry)

    for tag, index_entries in tags.items():
        output_file = output_dir.joinpath(tag, 'index.html')
        output_file.parent.mkdir(exist_ok=True, parents=True)
        output_file.write_text(JINJA_TEMPLATE_INDEX.render(
            gronk_commit=GRONK_COMMIT,
            automatic_index=True,
            search_bar=True,
            title=tag,
            index_entries=index_entries,
        ),
                               encoding='utf-8')

    output_file = output_dir.joinpath('index.html')
    output_file.parent.mkdir(exist_ok=True, parents=True)
    output_file.write_text(JINJA_TEMPLATE_INDEX.render(
        automatic_index=True,
        gronk_commit=GRONK_COMMIT,
        search_bar=True,
        title='tags',
        index_entries=[{
            'path': tag,
            'title': tag,
            'is_dir': False,
        } for tag in tags]),
                           encoding='utf-8')
2024-01-02 18:22:15 +00:00
def main(args):
    """
    Entry point for script

    Walks the notes directory, writes an index page for every directory and
    renders every file, then generates the home page, the tag browser and the
    permalink page and copies the css/js assets.

    Returns the process exit code: 0 on success, 1 when the output path is an
    existing file.
    """
    start_time = time.time()

    global LICENSE
    global FILEMAP

    FILEMAP = FileMap(args.notes, args.output_dir.joinpath('notes'))

    # TODO have some sort of 'site rebuild in progress - come back in a minute
    # or two!' or auto checking/refreshing page for when site is being built

    if args.output_dir.is_file():
        print(f"Output directory ({args.output_dir}) cannot be a file.")
        # stop here: carrying on only crashes in mkdir() below
        return 1

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # attempt to get licensing information
    license_path = args.notes.joinpath("LICENSE")
    if license_path.exists():
        LICENSE = license_path.read_text()

    # TODO git commit log integration

    for root_str, dirs, files in os.walk(args.notes):
        # prune in place so os.walk never descends into git metadata
        dirs[:] = [name for name in dirs if name != '.git']

        root = Path(root_str)
        if '.git' in root.parts:
            continue

        root_properties = FILEMAP.get(root)
        root_properties['dst_path']['raw'].mkdir(parents=True, exist_ok=True)

        html = JINJA_TEMPLATE_INDEX.render(
            gronk_commit=GRONK_COMMIT,
            title=root_properties.get('title', ''),
            content=root_properties.get('content', ''),
            content_after_search=root_properties['content_after_search'],
            automatic_index=root_properties['automatic_index'],
            search_bar=root_properties['search_bar'],
            index_entries=[{
                'title': entry.get('title', ''),
                'is_dir': entry.get('is_dir', False),
                'path': str(entry.get('path', Path(''))),
            } for entry in root_properties.get('index_entries', [])],
        )
        root_properties['dst_path']['raw'].joinpath('index.html').write_text(
            html, encoding='utf-8')

        # render each file
        for file in files:
            # don't render readme.md as index as it is used for directory
            if file == "readme.md":
                continue
            render_file(root.joinpath(file))

    process_home_index(args)

    # copy styling and js scripts necessary for function
    shutil.copytree(GRONK_CSS_DIR,
                    args.output_dir.joinpath('css'),
                    dirs_exist_ok=True)
    shutil.copytree(GRONK_JS_DIR,
                    args.output_dir.joinpath('js'),
                    dirs_exist_ok=True)

    generate_tag_browser(args.output_dir.joinpath('tags'))
    generate_permalink_page(args.output_dir)

    elapsed_time = time.time() - start_time
    print(f"generated notes {elapsed_time=}")

    return 0
2024-01-02 15:40:48 +00:00
def start_pandoc_server():
    """
    start pandoc-server and poll its version endpoint in a loop until it
    answers

    Returns the subprocess.Popen handle of the server; the caller is
    responsible for stopping it. The reported version is only printed.

    Raises RuntimeError when the server process exits before it answers.
    """
    start_time = time.time()

    process = subprocess.Popen(["/usr/bin/pandoc-server"],
                               stdout=subprocess.PIPE)

    # PANDOC_SERVER_URL may end in '/', avoid requesting '//version'
    version_url = f"{PANDOC_SERVER_URL.rstrip('/')}/version"

    version = None
    while True:
        try:
            # a timeout keeps one hung request from blocking startup forever
            resp = requests.get(version_url, timeout=PANDOC_TIMEOUT)
            version = resp.content.decode('utf-8')
            break
        except (requests.ConnectionError, requests.Timeout):
            time.sleep(0.1)

        # not reachable yet: make sure the server is still alive before
        # trying again
        rc = process.poll()
        if rc is not None:
            print(f"PANDOC SERVER FAILED TO START: {rc=}")
            print(process.stdout.read().decode("utf-8"))
            raise RuntimeError("Pandoc server failed to start")

    elapsed_time = time.time() - start_time
    print(f"pandoc-server started {version=} {elapsed_time=}")

    return process
2024-01-02 18:22:15 +00:00
# TODO implement useful logging and debug printing
2021-06-29 13:35:21 +00:00
if __name__ == '__main__':
    # the pandoc server has to be up before anything gets rendered and must
    # be stopped again however the run ends
    pandoc_process = start_pandoc_server()

    try:
        exit_code = main(get_args())
    except KeyboardInterrupt:
        # an interrupted run is treated as a clean exit
        exit_code = 0
    finally:
        pandoc_process.kill()

    sys.exit(exit_code)