#!/usr/bin/env python3

from bs4 import BeautifulSoup as bs
import subprocess
import frontmatter
import magic
import sys
import pathlib
import pypandoc
import shutil
import os
import regex as re
import json
import yaml
from html import escape as html_escape


TEXT_ARTICLE_TEMPLATE_FOOT = None
TEXT_ARTICLE_TEMPLATE_HEAD = None
INDEX_TEMPLATE_FOOT = None
INDEX_TEMPLATE_HEAD = None
EXTRA_INDEX_CONTENT = None

N2W_COMMIT = ""

# how many characters of a commit to show on landing page
COMMIT_SHA1_NCHARS = 7


def is_plaintext(filename):
    """Return True when libmagic reports a ``text/*`` MIME type for *filename*."""
    return magic.from_file(str(filename), mime=True).startswith('text/')


def get_files(folder):
    """
    Walk *folder* and sort every file found into three lists of path strings.

    Returns a ``(markdown, plaintext, other)`` tuple:

    * ``markdown``  - files whose suffix is ``.md``
    * ``plaintext`` - non-markdown files for which ``is_plaintext`` is true
    * ``other``     - every non-markdown file, *including* the plaintext ones
      (a plaintext file therefore appears in both ``plaintext`` and ``other``)

    Anything inside a ``.git`` directory is ignored.
    """
    markdown = []
    plaintext = []
    other = []

    for root, folders, files in os.walk(folder):
        # Compare whole path components: a substring test for '/.git' would
        # also discard unrelated directories such as '.github'.
        if '.git' in pathlib.Path(root).parts:
            folders[:] = []
            continue
        # Pruning in place stops os.walk from descending into git metadata.
        folders[:] = [name for name in folders if name != '.git']

        for filename in files:
            name = os.path.join(root, filename)
            if pathlib.Path(name).suffix == '.md':
                markdown.append(name)
                continue
            if is_plaintext(name):
                plaintext.append(name)
            other.append(name)

    return markdown, plaintext, other


def get_inherited_tags(file, base_folder):
    """
    Collect the ``itags`` declared in ``.n2w.yml`` files above *file*.

    Starting from the directory containing *file*, each ancestor directory is
    checked for a ``.n2w.yml`` file until ``base_folder.parent`` has been
    visited. *base_folder* must be a ``pathlib.Path``.

    Returns a de-duplicated list of tags (order is not defined).
    """
    # NOTE(review): the walk also inspects base_folder.parent itself, i.e. one
    # level above the notes root. That is kept as-is here; confirm whether it
    # is intentional.
    tags = []
    folder = pathlib.Path(file)

    while folder != base_folder.parent:
        print(f"get_inherited_tags {folder=}")
        parent = folder.parent
        if parent == folder:
            # Top of the path reached without meeting base_folder.parent
            # (e.g. *file* is not located under *base_folder*); stop instead
            # of looping forever.
            break
        folder = parent

        folder_metadata = folder.joinpath('.n2w.yml')
        if not folder_metadata.exists():
            continue

        with open(folder_metadata) as fp:
            folder_properties = yaml.safe_load(fp)

        # An empty file loads as None, and 'itags' may be absent.
        if not isinstance(folder_properties, dict):
            continue
        inherited = folder_properties.get('itags') or []
        if isinstance(inherited, str):
            # A scalar tag would otherwise be split into single characters.
            inherited = [inherited]
        tags += inherited

    print(f"get_inherited_tags {tags=}")
    return list(set(tags))


def git_head_sha1(working_dir):
    """
    Return the output of ``git rev-parse HEAD`` for the repository whose
    metadata lives in ``working_dir/.git``, stripped of surrounding whitespace.

    *working_dir* must be a ``pathlib.Path``. The git exit status is not
    checked; only stdout is read.
    """
    git_response = subprocess.run(
        [
            'git',
            f"--git-dir={working_dir.joinpath('.git')}",
            'rev-parse',
            'HEAD',
        ],
        stdout=subprocess.PIPE
    ).stdout.decode('utf-8')

    return git_response.strip()


def git_filehistory(working_dir, filename):
    """
    Return the ``git log -p`` history of *filename* as an HTML-safe string.

    *working_dir* is the ``pathlib.Path`` of the notes root (its ``.git``
    directory is used); *filename* must be located under it, otherwise
    ``relative_to`` raises ``ValueError``.

    The log is cut into one entry per commit. Each entry is HTML-escaped and
    the entries are joined with newlines. When git exits non-zero, or the log
    is empty, a single explanatory message is returned instead.
    """
    relative_path = pathlib.Path(filename).relative_to(working_dir)
    print(f"{pathlib.Path(filename).relative_to(working_dir)=}")

    git_response = subprocess.run(
        [
            'git',
            f"--git-dir={working_dir.joinpath('.git')}",
            "log",
            "-p",
            "--",
            relative_path,
        ],
        stdout=subprocess.PIPE
    )

    if git_response.returncode != 0:
        history = [
            f"File history not available: git log returned code {git_response.returncode}."
            "\nIf this is not a git repository, this is not a problem."
        ]
    else:
        # Diffs may contain bytes that are not valid UTF-8.
        log_text = git_response.stdout.decode('utf-8', errors='replace')
        # Anchor at line start: commit headers begin in column 0, whereas a
        # message line such as "This reverts commit <sha>" is indented and
        # must not start a new entry.
        pieces = re.split(
            r'(^commit [a-f0-9]{40})',
            log_text,
            flags=re.IGNORECASE | re.MULTILINE
        )
        # With one capture group the result is
        # [preamble, header, body, header, body, ...]; pair header with body.
        history = [
            f"{pieces[i]}{pieces[i + 1]}"
            for i in range(1, len(pieces) - 1, 2)
        ]
        if not history:
            history = ["This file has no history (it may not be part of the git repository)."]

    # The result is substituted into HTML, so &, < and > must be escaped.
    history = [html_escape(entry, quote=False) for entry in history]

    # NOTE(review): the literals on this line look like they once carried
    # wrapping markup around each entry (possibly <pre> tags) that has been
    # lost from this file; confirm against the templates before relying on
    # the plain newline join.
    return "\n" + "\n".join(history)


def get_dirs_to_index(folder):
    """
    Return the path (as a string) of every directory below *folder*.

    Directories nested inside ``folder/permalink`` are left out; the
    ``permalink`` directory itself is still listed. *folder* must be a
    ``pathlib.Path``.
    """
    permalink_dir = folder.joinpath('permalink')
    found = []

    for root, folders, files in os.walk(folder):
        # Test the directory being visited. Joining root with the base folder
        # collapses to the base folder when it is absolute, which made the
        # exclusion silently ineffective.
        if pathlib.Path(root).is_relative_to(permalink_dir):
            continue
        found.extend(os.path.join(root, name) for name in folders)

    return found


def update_required(src_filename, output_filename):
    """
    Return True when *output_filename* does not exist or is older than
    *src_filename* (by modification time).
    """
    if not os.path.exists(output_filename):
        return True
    return os.path.getmtime(src_filename) > os.path.getmtime(output_filename)


def get_args():
    """ Get command line arguments """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('notes', type=pathlib.Path)
    parser.add_argument('-o', '--output-dir', type=pathlib.Path, default='web')
    parser.add_argument('-t', '--template', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/article.html'))
    parser.add_argument('-H', '--template-text-head', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/textarticlehead.html'))
    parser.add_argument('-f', '--template-text-foot', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/textarticlefoot.html'))
    parser.add_argument('-i', '--template-index-head', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/indexhead.html'))
    parser.add_argument('-I', '--template-index-foot', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/indexfoot.html'))
    parser.add_argument('-s', '--stylesheet', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/styles.css'))
    parser.add_argument('--home_index', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/home_index.html'))
    parser.add_argument('--permalink_index', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/permalink_index.html'))
    parser.add_argument('-e', '--extra-index-content', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/templates/extra_index_content.html'))
    # Names given with -n are appended to the default list.
    parser.add_argument('-n', '--index-article-names', action='append', default=['index.md'])
    parser.add_argument('-F', '--force', action="store_true", help="Generate new output html even if source file was modified before output html")
    parser.add_argument('--fuse', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/fuse.js'))
    parser.add_argument('--searchjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/search.js'))
    parser.add_argument('--permalinkjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/permalink.js'))
    parser.add_argument('--tocsearchjs', type=pathlib.Path, default=pathlib.Path('/opt/notes2web/toc_search.js'))
    parser.add_argument('--toc-depth', type=int, default=6, dest='toc_depth')
    return parser.parse_args()


def main(args):
    """ Entry point for script """

    # These assignments create names local to main(); the module-level
    # constants of the same names are left untouched (there is no ``global``).
    with open(args.template_text_foot) as fp:
        TEXT_ARTICLE_TEMPLATE_FOOT = fp.read()

    with open(args.template_text_head) as fp:
        TEXT_ARTICLE_TEMPLATE_HEAD = fp.read()

    with open(args.template_index_foot) as fp:
        INDEX_TEMPLATE_FOOT = fp.read()

    with open(args.template_index_head) as fp:
        INDEX_TEMPLATE_HEAD = fp.read()

    with open(args.extra_index_content) as fp:
        EXTRA_INDEX_CONTENT = fp.read()

    # NOTE(review): only a message is printed here; execution continues and
    # the mkdir below will then raise. Confirm whether an exit was intended.
    if args.output_dir.is_file():
        print(f"Output directory ({args.output_dir}) cannot be a file.")

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the trailing comma makes this default a 1-tuple, not a
    # string; it looks accidental - confirm.
    notes_license = "This note has no copyright license.",
    print(f"{notes_license=}")
    license_path = args.notes.joinpath("LICENSE")
    if license_path.exists():
        with open(license_path) as fp:
            notes_license = fp.read()

    markdown_files, plaintext_files, other_files = get_files(args.notes)

    all_entries=[]
    dirs_with_index_article = []
    tag_dict = {}
    permalink_to_filepath = {}

    print(f"{markdown_files=}")
    for filename in markdown_files:
        print(f"{filename=}")

        # calculate output filename
        output_filename = args.output_dir.joinpath('notes').joinpath(
            pathlib.Path(filename).relative_to(args.notes)
        ).with_suffix('.html')
        if os.path.basename(filename) in args.index_article_names:
            output_filename = output_filename.parent.joinpath('index.html')
            dirs_with_index_article.append(str(output_filename.parent))
print(f"{output_filename=}") # extract tags from frontmatter, save to tag_dict fm = frontmatter.load(filename) if isinstance(fm.get('tags'), list): for tag in list(set(fm.get('tags') + get_inherited_tags(filename, args.notes))): t = { 'path': str(pathlib.Path(output_filename).relative_to(args.output_dir)), 'title': fm.get('title') or pathlib.Path(filename).name } if tag in tag_dict.keys(): tag_dict[tag].append(t) else: tag_dict[tag] = [t] # find headers in markdown with open(filename) as fp: lines = fp.read().split('\n') header_lines = [] for line in lines: if re.match('^#{1,6} \S', line): header_lines.append(" ".join(line.split(" ")[1:])) all_entries.append({ 'path': '/' + str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'title': fm.get('title') or pathlib.Path(filename).name, 'tags': list(set(fm.get('tags'))), 'headers': header_lines, 'uuid': fm.get('uuid') }) if 'uuid' in fm.keys(): permalink_to_filepath[fm['uuid']] = all_entries[-1]['path'] # update file if required if update_required(filename, output_filename) or args.force: filehistory = git_filehistory(args.notes, filename) with open(filename) as fp: article = frontmatter.load(fp) article['tags'] += get_inherited_tags(filename, args.notes) article['tags'] = sorted(list(set(article['tags']))) article['filehistory'] = filehistory article['licenseFull'] = notes_license html = pypandoc.convert_text(frontmatter.dumps(article), 'html', format='md', extra_args=[ f'--template={args.template}', '--mathjax', '--toc', f'--toc-depth={args.toc_depth}' ]) pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True) with open(output_filename, 'w+') as fp: fp.write(html) print(f"{plaintext_files=}") for filename in plaintext_files: filehistory = git_filehistory(args.notes, filename) title = os.path.basename(filename) output_filename = str( args.output_dir.joinpath('notes').joinpath( pathlib.Path(filename).relative_to(args.notes) ) ) + '.html' print(f"{output_filename=}") 
pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True) html = re.sub(r'\$title\$', title, TEXT_ARTICLE_TEMPLATE_HEAD) html = re.sub(r'\$h1title\$', title, html) html = re.sub(r'\$raw\$', os.path.basename(filename), html) html = re.sub(r'\$licenseFull\$', notes_license, html) html = html.replace('$filehistory$', filehistory) with open(filename) as fp: html += fp.read().replace("<", "<").replace(">", ">") html += TEXT_ARTICLE_TEMPLATE_FOOT with open(output_filename, 'w+') as fp: fp.write(html) all_entries.append({ 'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'title': title, 'tags': [get_inherited_tags(filename, args.notes)], 'headers': [] }) print(f"{other_files=}") for filename in other_files: output_filename = str( args.output_dir.joinpath('notes').joinpath( pathlib.Path(filename).relative_to(args.notes) ) ) title = os.path.basename(filename) pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True) all_entries.append({ 'path': str(pathlib.Path(*pathlib.Path(output_filename).parts[1:])), 'title': title, 'tags': [get_inherited_tags(filename, args.notes)], 'headers': [] }) shutil.copyfile(filename, output_filename) tagdir = args.output_dir.joinpath('.tags') tagdir.mkdir(parents=True, exist_ok=True) for tag in tag_dict.keys(): html = re.sub(r'\$title\$', f'{tag}', INDEX_TEMPLATE_HEAD) html = re.sub(r'\$h1title\$', f'tag: {tag}', html) html = re.sub(r'\$extra_content\$', '', html) for entry in tag_dict[tag]: html += f"" html += INDEX_TEMPLATE_FOOT with open(tagdir.joinpath(f'{tag}.html'), 'w+') as fp: fp.write(html) dirs_to_index = [args.output_dir.name] + get_dirs_to_index(args.output_dir) print(f"{dirs_to_index=}") print(f"{dirs_with_index_article=}") for d in dirs_to_index: print(f"{d in dirs_with_index_article=} {d=}") if d in dirs_with_index_article: continue directory = pathlib.Path(d) paths = os.listdir(directory) #print(f"{paths=}") indexentries = [] for p in paths: path = pathlib.Path(p) #print(f"{path=}") 
if p in [ 'index.html', '.git' ]: continue fullpath = directory.joinpath(path) title = path.name if path.suffix == '.html': with open(fullpath) as fp: soup = bs(fp.read(), 'html.parser') try: title = soup.find('title').get_text() or pathlib.Path(path).name except AttributeError: title = pathlib.Path(path).stem elif fullpath.is_dir(): title = path elif is_plaintext(fullpath): # don't add plaintext files to index, since they have a html wrapper continue if str(title).strip() == '': title = path indexentries.append({ 'title': str(title), 'path': './' + str(path), 'isdirectory': fullpath.is_dir() }) indexentries.sort(key=lambda entry: str(entry['title']).lower()) indexentries.sort(key=lambda entry: entry['isdirectory'], reverse=True) html = re.sub(r'\$title\$', str(directory), INDEX_TEMPLATE_HEAD) html = re.sub(r'\$h1title\$', str(directory), html) html = re.sub(r'\$extra_content\$', EXTRA_INDEX_CONTENT if directory == args.notes else '', html ) for entry in indexentries: html += ( '
' f'{entry["title"]}{"/" if entry["isdirectory"] else ""}' '
' '