#!/usr/bin/env python3 from bs4 import BeautifulSoup as bs import subprocess import frontmatter import magic import sys import pathlib import pypandoc import shutil import os import regex as re import json TEXT_ARTICLE_TEMPLATE_FOOT = None TEXT_ARTICLE_TEMPLATE_HEAD = None INDEX_TEMPLATE_FOOT = None INDEX_TEMPLATE_HEAD = None EXTRA_INDEX_CONTENT = None def get_files(folder): markdown = [] plaintext = [] other = [] for root, folders, files in os.walk(folder): for filename in files: if '/.git' in root: continue name = os.path.join(root, filename) if pathlib.Path(name).suffix == '.md': markdown.append(name) elif re.match(r'^text/', magic.from_file(name, mime=True)): plaintext.append(name) other.append(name) else: other.append(name) return markdown, plaintext, other def git_filehistory(working_dir, filename): print(f"{pathlib.Path(filename).relative_to(working_dir)=}") git_response = subprocess.run( [ 'git', f"--git-dir={working_dir.joinpath('.git')}", "log", "-p", "--", pathlib.Path(filename).relative_to(working_dir) ], stdout=subprocess.PIPE ) filehistory = [f"File history not available: git log returned code {git_response.returncode}." "\nIf this is not a git repository, this is not a problem."] if git_response.returncode == 0: filehistory = git_response.stdout.decode('utf-8') temp = re.split( r'(commit [a-f0-9]{40})', filehistory, flags=re.IGNORECASE ) for t in temp: if t == '': temp.remove(t) filehistory = [] for i in range(0, len(temp)-1, 2): filehistory.append(f"{temp[i]}{temp[i+1]}") if filehistory == "": filehistory = ["This file has no history (it may not be part of the git repository)."] filehistory = [ x.replace("<", "<").replace(">", ">") for x in filehistory] filehistory = "
\n" + "
\n".join(filehistory) + "
" return filehistory def get_dirs_to_index(folder): r = [] for root, folders, files in os.walk(folder): if pathlib.Path(os.path.join(root, folder)).is_relative_to(folder.joinpath('permalink')): continue [r.append(os.path.join(root, folder)) for folder in folders] return r def update_required(src_filename, output_filename): return not os.path.exists(output_filename) or os.path.getmtime(src_filename) > os.path.getmtime(output_filename) def get_args(): """ Get command line arguments """ import argparse parser = argparse.ArgumentParser() parser.add_argument('notes', type=pathlib.Path) parser.add_argument('-o', '--output-dir', type=pathlib.Path, default='web') parser.add_argument('-t', '--template', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/article.html')) parser.add_argument('-H', '--template-text-head', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/textarticlehead.html')) parser.add_argument('-f', '--template-text-foot', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/textarticlefoot.html')) parser.add_argument('-i', '--template-index-head', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/indexhead.html')) parser.add_argument('-I', '--template-index-foot', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/indexfoot.html')) parser.add_argument('-s', '--stylesheet', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/styles.css')) parser.add_argument('--home_index', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/home_index.html')) parser.add_argument('--permalink_index', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/permalink_index.html')) parser.add_argument('-e', '--extra-index-content', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/extra_index_content.html')) parser.add_argument('-n', '--index-article-names', action='append', default=['index.md']) parser.add_argument('-F', '--force', action="store_true", help="Generate new output html even if source file was modified before output html") parser.add_argument('--fuse', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/fuse.js')) parser.add_argument('--searchjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/search.js')) parser.add_argument('--permalinkjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/permalink.js')) parser.add_argument('--tocsearchjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/toc_search.js')) parser.add_argument('--toc-depth', type=int, default=6, dest='toc_depth') return parser.parse_args() def main(args): """ Entry point for script """ with open(args.template_text_foot) as fp: TEXT_ARTICLE_TEMPLATE_FOOT = fp.read() with open(args.template_text_head) as fp: TEXT_ARTICLE_TEMPLATE_HEAD = fp.read() with open(args.template_index_foot) as fp: INDEX_TEMPLATE_FOOT = fp.read() with open(args.template_index_head) as fp: INDEX_TEMPLATE_HEAD = fp.read() with open(args.extra_index_content) as fp: EXTRA_INDEX_CONTENT = fp.read() if args.output_dir.is_file(): print(f"Output directory ({args.output_dir}) cannot be a file.") args.output_dir.mkdir(parents=True, exist_ok=True) notes_license = "This note has no copyright license.", print(f"{notes_license=}") license_path = args.notes.joinpath("LICENSE") if license_path.exists(): with open(license_path) as fp: notes_license = fp.read() markdown_files, plaintext_files, other_files = get_files(args.notes) all_entries=[] dirs_with_index_article = [] tag_dict = {} permalink_to_filepath = {} print(f"{markdown_files=}") for filename in markdown_files: print(f"{filename=}") # calculate output filename output_filename = args.output_dir.joinpath('notes').joinpath( pathlib.Path(filename).relative_to(args.notes) ).with_suffix('.html') if os.path.basename(filename) in args.index_article_names: output_filename = output_filename.parent.joinpath('index.html') dirs_with_index_article.append(str(output_filename.parent)) print(f"{output_filename=}") # extract tags from frontmatter, save to tag_dict fm = frontmatter.load(filename) if isinstance(fm.get('tags'), list): for tag in fm.get('tags'): t = { 'path': str(pathlib.Path(output_filename).relative_to(args.output_dir)), 'title': fm.get('title') or pathlib.Path(filename).name } if tag in tag_dict.keys(): tag_dict[tag].append(t) else: tag_dict[tag] = [t] # find headers in markdown with open(filename) as fp: lines = fp.read().split('\n') header_lines = [] for line in lines: if re.match('^#{1,6} \S', line): header_lines.append(" ".join(line.split(" ")[1:])) all_entries.append({ 'path': '/' + str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'title': fm.get('title') or pathlib.Path(filename).name, 'tags': fm.get('tags'), 'headers': header_lines, 'uuid': fm.get('uuid') }) if 'uuid' in fm.keys(): permalink_to_filepath[fm['uuid']] = all_entries[-1]['path'] # update file if required if update_required(filename, output_filename) or args.force: filehistory = git_filehistory(args.notes, filename) html = pypandoc.convert_file(filename, 'html', extra_args=[ f'--template={args.template}', '-V', f'filehistory={filehistory}', '-V', f'licenseFull={notes_license}', '--mathjax', '--toc', f'--toc-depth={args.toc_depth}' ]) pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True) with open(output_filename, 'w+') as fp: fp.write(html) print(f"{plaintext_files=}") for filename in plaintext_files: filehistory = git_filehistory(args.notes, filename) title = os.path.basename(filename) output_filename = str( args.output_dir.joinpath('notes').joinpath( pathlib.Path(filename).relative_to(args.notes) ) ) + '.html' print(f"{output_filename=}") pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True) html = re.sub(r'\$title\$', title, TEXT_ARTICLE_TEMPLATE_HEAD) html = re.sub(r'\$h1title\$', title, html) html = re.sub(r'\$raw\$', os.path.basename(filename), html) html = re.sub(r'\$licenseFull\$', notes_license, html) html = html.replace('$filehistory$', filehistory) with open(filename) as fp: html += fp.read().replace("<", "<").replace(">", ">") html += TEXT_ARTICLE_TEMPLATE_FOOT with open(output_filename, 'w+') as fp: fp.write(html) all_entries.append({ 'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'title': title, 'tags': [], 'headers': [] }) print(f"{other_files=}") for filename in other_files: output_filename = str( args.output_dir.joinpath('notes').joinpath( pathlib.Path(filename).relative_to(args.notes) ) ) pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True) all_entries.append({ 'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'title': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'tags': [], 'headers': [] }) shutil.copyfile(filename, output_filename) tagdir = args.output_dir.joinpath('.tags') tagdir.mkdir(parents=True, exist_ok=True) for tag in tag_dict.keys(): html = re.sub(r'\$title\$', f'{tag}', INDEX_TEMPLATE_HEAD) html = re.sub(r'\$h1title\$', f'tag: {tag}', html) html = re.sub(r'\$extra_content\$', '', html) for entry in tag_dict[tag]: html += f"
{entry['title']}
" html += INDEX_TEMPLATE_FOOT with open(tagdir.joinpath(f'{tag}.html'), 'w+') as fp: fp.write(html) dirs_to_index = [args.output_dir.name] + get_dirs_to_index(args.output_dir) print(f"{dirs_to_index=}") print(f"{dirs_with_index_article=}") for d in dirs_to_index: print(f"{d in dirs_with_index_article=} {d=}") if d in dirs_with_index_article: continue directory = pathlib.Path(d) paths = os.listdir(directory) #print(f"{paths=}") indexentries = [] for p in paths: path = pathlib.Path(p) #print(f"{path=}") if p in [ 'index.html', '.git' ]: continue fullpath = directory.joinpath(path) if path.suffix == '.html': with open(fullpath) as fp: soup = bs(fp.read(), 'html.parser') try: title = soup.find('title').get_text() or pathlib.Path(path).name except AttributeError: title = pathlib.Path(path).stem elif fullpath.is_dir(): title = path else: # don't add plaintext files to index, since they have a html wrapper continue if str(title).strip() == '': title = path indexentries.append({ 'title': str(title), 'path': str(path), 'isdirectory': fullpath.is_dir() }) indexentries.sort(key=lambda entry: str(entry['title']).lower()) indexentries.sort(key=lambda entry: entry['isdirectory'], reverse=True) html = re.sub(r'\$title\$', str(directory), INDEX_TEMPLATE_HEAD) html = re.sub(r'\$h1title\$', str(directory), html) html = re.sub(r'\$extra_content\$', EXTRA_INDEX_CONTENT if directory == args.notes else '', html ) for entry in indexentries: html += ( '
  • ' f'

    ' f'{entry["title"]}{"/" if entry["isdirectory"] else ""}' '

    ' '
  • ' ) html += INDEX_TEMPLATE_FOOT with open(directory.joinpath('index.html'), 'w+') as fp: fp.write(html) shutil.copyfile(args.stylesheet, args.output_dir.joinpath('styles.css')) shutil.copyfile(args.fuse, args.output_dir.joinpath('fuse.js')) shutil.copyfile(args.searchjs, args.output_dir.joinpath('search.js')) shutil.copyfile(args.tocsearchjs, args.output_dir.joinpath('toc_search.js')) shutil.copyfile(args.permalinkjs, args.output_dir.joinpath('permalink.js')) with open(args.output_dir.joinpath('index.html'), 'w+') as fp: with open(args.home_index) as fp2: html = re.sub(r'\$title\$', args.output_dir.parts[0], fp2.read()) html = re.sub(r'\$h1title\$', args.output_dir.parts[0], html) html = re.sub(r'\$data\$', json.dumps(all_entries), html) fp.write(html) permalink_dir = args.output_dir.joinpath('permalink') permalink_dir.mkdir(exist_ok=True) with open(args.permalink_index) as fp: html = re.sub(r'\$data\$', json.dumps(permalink_to_filepath), fp.read()) with open(permalink_dir.joinpath('index.html'), 'w+') as fp: fp.write(html) print(tag_dict) return 0 if __name__ == '__main__': try: sys.exit(main(get_args())) except KeyboardInterrupt: sys.exit(0)