gronk/notes2web.py

#!/usr/bin/env python3
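"""Convert a folder of markdown and plaintext notes into a browsable static HTML site.

Markdown files are rendered with pandoc through an HTML template, plaintext files
get an HTML wrapper, and everything else is copied verbatim. Per-tag and
per-directory index pages are generated, along with a home page that embeds a
JSON search index consumed by fuse.js/search.js.
"""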
from bs4 import BeautifulSoup as bs
import subprocess
import frontmatter
import magic
import sys
import pathlib
import pypandoc
import shutil
import os
import re
import json
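
# Default placeholders for the HTML template fragments (article/index head and
# foot, extra index content); main() reads the actual fragments from the
# template files given on the command line.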
TEXT_ARTICLE_TEMPLATE_FOOT = None
TEXT_ARTICLE_TEMPLATE_HEAD = None
INDEX_TEMPLATE_FOOT = None
INDEX_TEMPLATE_HEAD = None
EXTRA_INDEX_CONTENT = None


def get_files(folder):
    """Walk `folder` and sort files into markdown, plaintext and other."""
    markdown = []
    plaintext = []
    other = []
    for root, folders, files in os.walk(folder):
        for filename in files:
            if '/.git' in root:
                continue

            name = os.path.join(root, filename)

            if pathlib.Path(name).suffix == '.md':
                markdown.append(name)
            elif re.match(r'^text/', magic.from_file(name, mime=True)):
                # plaintext files are also kept in `other` so the raw file is
                # copied alongside its generated HTML wrapper
                plaintext.append(name)
                other.append(name)
            else:
                other.append(name)
    return markdown, plaintext, other


def git_filehistory(working_dir, filename):
    """Return the `git log -p` history of `filename` as HTML <pre> blocks."""
    print(f"{pathlib.Path(filename).relative_to(working_dir)=}")
    git_response = subprocess.run(
        [
            'git',
            f"--git-dir={working_dir.joinpath('.git')}",
            "log",
            "-p",
            "--",
            pathlib.Path(filename).relative_to(working_dir)
        ],
        stdout=subprocess.PIPE
    )
    filehistory = [f"File history not available: git log returned code {git_response.returncode}."
                   "\nIf this is not a git repository, this is not a problem."]

    if git_response.returncode == 0:
        # split the log output into one chunk per commit
        temp = re.split(
            r'(commit [a-f0-9]{40})',
            git_response.stdout.decode('utf-8'),
            flags=re.IGNORECASE
        )
        temp = [t for t in temp if t != '']
        filehistory = []
        for i in range(0, len(temp)-1, 2):
            filehistory.append(f"{temp[i]}{temp[i+1]}")

    if not filehistory:
        filehistory = ["This file has no history (it may not be part of the git repository)."]

    filehistory = "<pre>\n" + "</pre><pre>\n".join(filehistory) + "</pre>"

    return filehistory


def get_dirs(folder):
    """Return every subdirectory under `folder`, recursively."""
    r = []
    for root, folders, files in os.walk(folder):
        for d in folders:
            r.append(os.path.join(root, d))
    return r


def update_required(src_filename, output_filename):
    """Return True if the output file is missing or older than its source."""
    return (not os.path.exists(output_filename)
            or os.path.getmtime(src_filename) > os.path.getmtime(output_filename))


def get_args():
    """ Get command line arguments """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('notes', type=pathlib.Path)
    parser.add_argument('-o', '--output-dir', type=pathlib.Path, default='web')
    parser.add_argument('-t', '--template', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/article.html'))
    parser.add_argument('-H', '--template-text-head', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/textarticlehead.html'))
    parser.add_argument('-f', '--template-text-foot', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/textarticlefoot.html'))
    parser.add_argument('-i', '--template-index-head', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/indexhead.html'))
    parser.add_argument('-I', '--template-index-foot', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/indexfoot.html'))
    parser.add_argument('-s', '--stylesheet', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/styles.css'))
    parser.add_argument('--home_index', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/home_index.html'))
    parser.add_argument('-e', '--extra-index-content', type=pathlib.Path,
                        default=pathlib.Path('/opt/notes2web/templates/extra_index_content.html'))
    parser.add_argument('-n', '--index-article-names', action='append', default=['index.md'])
    parser.add_argument('-F', '--force', action="store_true",
                        help="Generate new output html even if source file was modified before output html")
    parser.add_argument('--fuse', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/fuse.js'))
    parser.add_argument('--searchjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/search.js'))

    return parser.parse_args()
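
# Example invocation (paths are illustrative):
#   ./notes2web.py ~/notes -o web --force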


def main(args):
    """ Entry point for script """
    with open(args.template_text_foot) as fp:
        TEXT_ARTICLE_TEMPLATE_FOOT = fp.read()
    with open(args.template_text_head) as fp:
        TEXT_ARTICLE_TEMPLATE_HEAD = fp.read()
    with open(args.template_index_foot) as fp:
        INDEX_TEMPLATE_FOOT = fp.read()
    with open(args.template_index_head) as fp:
        INDEX_TEMPLATE_HEAD = fp.read()
    with open(args.extra_index_content) as fp:
        EXTRA_INDEX_CONTENT = fp.read()

    if args.output_dir.is_file():
        print(f"Output directory ({args.output_dir}) cannot be a file.")
        return 1

    args.output_dir.mkdir(parents=True, exist_ok=True)

    markdown_files, plaintext_files, other_files = get_files(args.notes)

    all_entries = []
    dirs_with_index_article = []
    tag_dict = {}
print(f"{markdown_files=}")
for filename in markdown_files:
2021-06-29 18:22:25 +00:00
print(f"{filename=}")
2021-08-15 19:40:13 +00:00
2021-08-24 12:52:47 +00:00
# calculate output filename
output_filename = args.output_dir.joinpath('notes').joinpath(
pathlib.Path(filename).relative_to(args.notes)
).with_suffix('.html')
if os.path.basename(filename) in args.index_article_names:
2021-08-24 12:52:47 +00:00
output_filename = output_filename.parent.joinpath('index.html')
dirs_with_index_article.append(str(output_filename.parent))
print(f"{output_filename=}")
2021-08-15 19:40:13 +00:00
2021-08-24 12:52:47 +00:00
# extract tags from frontmatter, save to tag_dict
2021-08-15 19:40:13 +00:00
fm = frontmatter.load(filename)
if isinstance(fm.get('tags'), list):
for tag in fm.get('tags'):
2021-08-24 12:52:47 +00:00
t = {
'path': str(pathlib.Path(output_filename).relative_to(args.output_dir)),
'title': fm.get('title') or pathlib.Path(filename).name
2021-08-24 12:52:47 +00:00
}
2021-08-15 19:40:13 +00:00
if tag in tag_dict.keys():
2021-08-24 12:52:47 +00:00
tag_dict[tag].append(t)
2021-08-15 19:40:13 +00:00
else:
2021-08-24 12:52:47 +00:00
tag_dict[tag] = [t]
# find headers in markdown
2021-08-20 13:31:34 +00:00
with open(filename) as fp:
lines = fp.read().split('\n')
header_lines = []
for line in lines:
if re.match('^#{1,6} \S', line):
header_lines.append(" ".join(line.split(" ")[1:]))
2021-08-19 13:43:42 +00:00
all_entries.append({
'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])),
'title': fm.get('title') or pathlib.Path(filename).name,
2021-08-20 13:31:34 +00:00
'tags': fm.get('tags'),
'headers': header_lines
2021-08-19 13:43:42 +00:00
})
2021-06-29 13:35:21 +00:00
2021-08-24 12:52:47 +00:00
# update file if required
2021-08-15 18:34:29 +00:00
if update_required(filename, output_filename) or args.force:
2021-08-24 12:52:47 +00:00
filehistory = git_filehistory(args.notes, filename)
2021-08-21 01:14:12 +00:00
html = pypandoc.convert_file(filename, 'html', extra_args=[f'--template={args.template}', '-V', f'filehistory={filehistory}'])
2021-08-24 12:52:47 +00:00
pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
2021-08-15 18:34:29 +00:00
with open(output_filename, 'w+') as fp:
fp.write(html)

    print(f"{plaintext_files=}")
    for filename in plaintext_files:
        filehistory = git_filehistory(args.notes, filename)
        title = os.path.basename(filename)
        output_filename = str(
            args.output_dir.joinpath('notes').joinpath(
                pathlib.Path(filename).relative_to(args.notes)
            )
        ) + '.html'
        print(f"{output_filename=}")
        pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True)

        html = re.sub(r'\$title\$', title, TEXT_ARTICLE_TEMPLATE_HEAD)
        html = re.sub(r'\$h1title\$', title, html)
        html = re.sub(r'\$raw\$', os.path.basename(filename), html)
        html = html.replace('$filehistory$', filehistory)
        with open(filename) as fp:
            html += fp.read().replace("<", "&lt;").replace(">", "&gt;")
        html += TEXT_ARTICLE_TEMPLATE_FOOT

        with open(output_filename, 'w+') as fp:
            fp.write(html)

        all_entries.append({
            'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])),
            'title': title,
            'tags': [],
            'headers': []
        })
print(f"{other_files=}")
for filename in other_files:
2021-08-24 12:52:47 +00:00
output_filename = str(
args.output_dir.joinpath('notes').joinpath(
pathlib.Path(filename).relative_to(args.notes)
)
)
pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
2021-08-19 13:43:42 +00:00
all_entries.append({
'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])),
'title': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])),
2021-08-20 13:31:34 +00:00
'tags': [],
'headers': []
2021-08-19 13:43:42 +00:00
})
2021-06-29 13:35:21 +00:00
shutil.copyfile(filename, output_filename)
2021-08-24 12:52:47 +00:00
    # write one index page per tag under .tags/
    tagdir = args.output_dir.joinpath('.tags')
    tagdir.mkdir(parents=True, exist_ok=True)

    for tag in tag_dict.keys():
        html = re.sub(r'\$title\$', f'{tag}', INDEX_TEMPLATE_HEAD)
        html = re.sub(r'\$h1title\$', f'tag: {tag}', html)
        html = re.sub(r'\$extra_content\$', '', html)

        for entry in tag_dict[tag]:
            html += f"<div class=\"article\"><a href=\"/{entry['path']}\">{entry['title']}</a></div>"
        html += INDEX_TEMPLATE_FOOT

        with open(tagdir.joinpath(f'{tag}.html'), 'w+') as fp:
            fp.write(html)

    dirs_to_index = [args.output_dir.name] + get_dirs(args.output_dir)
    print(f"{dirs_to_index=}")
    print(f"{dirs_with_index_article=}")

    for d in dirs_to_index:
        print(f"{d in dirs_with_index_article=} {d=}")
        # directories that already have their own index article keep it
        if d in dirs_with_index_article:
            continue

        directory = pathlib.Path(d)
        paths = os.listdir(directory)
        #print(f"{paths=}")

        indexentries = []

        for p in paths:
            path = pathlib.Path(p)
            #print(f"{path=}")
            if p in ['index.html', '.git']:
                continue

            fullpath = directory.joinpath(path)
            if path.suffix == '.html':
                with open(fullpath) as fp:
                    soup = bs(fp.read(), 'html.parser')
                try:
                    title = soup.find('title').get_text() or pathlib.Path(path).name
                except AttributeError:
                    title = pathlib.Path(path).stem
            elif fullpath.is_dir():
                title = path
            else:
                # don't add plaintext files to index, since they have a html wrapper
                continue

            if str(title).strip() == '':
                title = path

            indexentries.append({
                'title': str(title),
                'path': str(path),
                'isdirectory': fullpath.is_dir()
            })

        # sort alphabetically by title, with directories listed first
        indexentries.sort(key=lambda entry: str(entry['title']).lower())
        indexentries.sort(key=lambda entry: entry['isdirectory'], reverse=True)

        html = re.sub(r'\$title\$', str(directory), INDEX_TEMPLATE_HEAD)
        html = re.sub(r'\$h1title\$', str(directory), html)
        html = re.sub(
            r'\$extra_content\$',
            EXTRA_INDEX_CONTENT if directory == args.notes else '',
            html
        )

        for entry in indexentries:
            html += f"<div class=\"article\"><a href=\"{entry['path']}\">{entry['title']}{'/' if entry['isdirectory'] else ''}</a></div>"
        html += INDEX_TEMPLATE_FOOT

        with open(directory.joinpath('index.html'), 'w+') as fp:
            fp.write(html)

    shutil.copyfile(args.stylesheet, args.output_dir.joinpath('styles.css'))
    shutil.copyfile(args.fuse, args.output_dir.joinpath('fuse.js'))
    shutil.copyfile(args.searchjs, args.output_dir.joinpath('search.js'))

    # write the home page, embedding the search index as JSON for fuse.js/search.js
    with open(args.output_dir.joinpath('index.html'), 'w+') as fp:
        with open(args.home_index) as fp2:
            html = re.sub(r'\$title\$', args.output_dir.parts[0], fp2.read())
            html = re.sub(r'\$h1title\$', args.output_dir.parts[0], html)
            html = re.sub(r'\$data\$', json.dumps(all_entries), html)
            fp.write(html)

    print(tag_dict)

    return 0


if __name__ == '__main__':
    try:
        sys.exit(main(get_args()))
    except KeyboardInterrupt:
        sys.exit(0)