#!/usr/bin/python3
# coding: utf-8
# Copyright (C) 2016 Antoine Beaupré <anarcat@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import division, absolute_import
from __future__ import print_function
import glob
import os
import re
import shlex
import shutil
import subprocess
import time
import logging
logger = logging.getLogger(__name__)
import click
from jinja2 import Template
from markdown import markdown
# workaround missing import issue in 0.5
from humanize.time import naturaldelta
from debmans.utils import mkdirp, find_static_file
@click.command()
@click.option('-s', '--srcdir', default='.',
              help='where the source manuals are (default: output of the extract command)')
@click.pass_obj
def render(obj, srcdir):
    '''render documentation to HTML

    this looks for patterns matching a certain regex in the given
    SRCDIR directory

    .. note:: this assumes files have an extension that should be
              stripped. for manpages, this should generally be
              ``.gz``. if manpages are not compressed, this will break
              section support.

    .. todo:: document that compressed manpages are mandatory
    '''
    # NOTE: the docstring above is shown by click as the command help,
    # so implementation notes live in comments instead.
    #
    # ``obj`` is the click context object; this function reads its
    # 'output', 'mirror', 'patterns', 'progress' and 'prefix' keys, plus
    # the optional 'changed_paths' key.
    output = obj['output']
    mirror = obj['mirror']
    patterns = obj['patterns']

    # get modified paths from extractor, if it was also called
    if 'changed_paths' in obj:
        if srcdir:
            # NOTE(review): --srcdir defaults to '.', so this warning
            # fires even when the option was not given explicitly.
            logger.warning('ignoring --srcdir parameter because called with extractor')
            srcdir = None
        logger.info('received %d paths from extractor',
                    len(obj['changed_paths']))
        filelist = match_jobs(obj['changed_paths'], patterns)
    else:
        logger.info('looking for patterns %s in %s', patterns, srcdir)
        filelist = find_files(srcdir, patterns)

    if obj['progress']:
        progress = click.progressbar
    else:
        class fake_progress:
            '''context manager with the same call shape as
            click.progressbar, handing back the iterable untouched'''

            def __init__(self, it, label=None):
                self.it = it

            def __enter__(self):
                return self.it

            def __exit__(self, exc_type, exc_value, traceback):
                pass
        progress = fake_progress

    count = 0
    started = time.time()
    # the generator is materialized so the progress bar knows its length
    with progress(list(filelist),
                  label='rendering manpages') as bar:
        for renderer, source, _match in bar:
            if srcdir:
                # from srcdir, replace it with output dir
                target = source.replace(srcdir, output, 1)
            else:
                # from extractor: already in the output dir
                target = source
            # replace .gz extension with .html
            base, _ext = os.path.splitext(target)
            target = os.path.abspath(base + '.html')
            try:
                renderer.render(source, target,
                                prefix=obj['prefix'],
                                suites=mirror.releases)
            except CommandRendererError as e:
                # a single broken page must not abort the whole run
                logger.warning('%s failed to convert %s: %s',
                               renderer, source, e)
            # counts every file processed, including failed conversions
            count += 1
    logger.info('rendered %d manpages in %s', count,
                naturaldelta(time.time() - started))
class JinjaRenderer(object):
    '''render Jinja templates using given parameters, caching and
    simulation

    this is basically an extension of the Template class, but extended
    so we can easily pass paths (instead of strings) in and out.

    .. todo:: we should probably have derived Template directly here.
    '''

    def __init__(self, template, cache=True, dryrun=False):
        '''create a renderer

        :param str template: path to the Jinja2 template
        :param bool cache: do not overwrite output file if newer
        :param bool dryrun: do not write anything in any case, useful
                            to test cache detection
        '''
        #: template to use to render the data
        self.template = template
        #: if we should check timestamps before writing
        self.cache = cache
        #: if True, do not write
        self.dryrun = dryrun
        #: source file currently processed
        self.source = None

    def generated_time(self):
        '''handy function to add timestamp to footers

        :return: ``Generated on`` followed by the current local time
        :rtype: str
        '''
        return 'Generated on %s' % time.strftime('%Y-%m-%d %H:%M:%S%Z')

    def render(self, target, **data):
        '''render template with given data

        if ``pageinfo`` isn't provided in :data:`data`, it is set to
        the output of :func:`generated_time`.

        nothing is written if the target is up to date (see
        :meth:`uptodate`) or if :attr:`dryrun` is set. both the
        template and the target are handled as UTF-8 text.

        :param str target: path to the target file
        :param dict data: set of parameters passed to :meth:`~jinja2.Template.render`
        '''
        # io.open() takes an explicit encoding on Python 2 and 3 alike,
        # unlike the builtin open() of Python 2
        import io

        if 'pageinfo' not in data:
            data['pageinfo'] = self.generated_time()
        if self.uptodate(target):
            logging.debug('%s is up to date with template %s',
                          target, self.template)
            return
        if self.dryrun:
            logging.debug('dryrun: not writing file %s with template %s',
                          target, self.template)
            return
        logging.debug('writing file %s with template %s',
                      target, self.template)
        mkdirp(os.path.dirname(target))
        with io.open(self.template, 'r', encoding='utf-8') as templatefile:
            tmpl = Template(templatefile.read())
        # render before opening the target: a template error must not
        # leave behind a truncated file that later looks up to date
        output = tmpl.render(data)
        with io.open(target, 'w', encoding='utf-8') as targetfile:
            targetfile.write(output)

    def uptodate(self, target):
        '''check if the target file is newer than template

        also checks the :attr:`source` attribute if it is set, which
        allows for subclasses to add a file to check.

        :param str target: path to the file that would be written
        :return: True if caching is enabled, the target exists and it
                 is at least as recent as the template (and the source,
                 if any)
        :rtype: bool
        '''
        if not self.cache or not os.path.exists(target):
            return False
        target_mtime = os.stat(target).st_mtime
        if os.stat(self.template).st_mtime > target_mtime:
            return False
        return (self.source is None
                or os.stat(self.source).st_mtime <= target_mtime)
class MarkdownRenderer(JinjaRenderer):
    '''render markdown source files with a jinja template'''

    def render(self, source, target, **data):
        '''render the given source file

        the source is read as UTF-8 text, converted to HTML and handed
        to the template as the ``content`` parameter.

        :param str source: path to the Markdown source file
        :param str target: passed to :func:`JinjaRenderer.render`
        :param dict data: extra parameters passed to
                          :func:`JinjaRenderer.render`
        '''
        # io.open() takes an explicit encoding on Python 2 and 3 alike
        import io

        # recorded so that uptodate() also compares against the source
        self.source = source
        # markdown() works on unicode text, so decode explicitly
        # instead of relying on the locale (or getting raw bytes)
        with io.open(source, 'r', encoding='utf-8') as sourcefile:
            html = markdown(sourcefile.read())
        super(MarkdownRenderer, self).render(target, content=html, **data)
class CommandRenderer(JinjaRenderer):
    '''a simple template-based rendering system

    a file is passed as an argument to a command and the output is
    written into the given template, in the `{{content}}` Jinja2
    element.

    this is meant to be subclassed in command-specific renderers.
    those can also not even be command-based, as long as they have the
    following parameters:

    - ``pattern``: regular expression pattern for this class
    - ``render(source, target, **data)``: render the given source file
      into the target file, with the attached Jinja data. at least
      ``content`` is expected in there, but ``description`` and
      ``title`` are also encouraged, those should match the template.
    '''

    def __init__(self, template, command=None, cache=True, dryrun=False):
        '''create a renderer with the given command

        :param str command: command to launch for this renderer. when
                            omitted, the ``command`` attribute of the
                            class is used.

        Other parameters passed as is to
        :func:`JinjaRenderer.__init__`
        '''
        if command is not None:
            self.command = command
        super(CommandRenderer, self).__init__(template, cache, dryrun)

    def postprocess(self, data):
        '''modify the data sent to the template after execution

        this allows subclasses to intervene between the command call
        and the render call.

        by default does nothing

        :param dict data: template parameters, modified in place
        '''

    def render(self, source, target, **data):
        '''render the given source file using external command defined in
        constructor

        does not call command in :attr:`dryrun` mode.

        .. todo:: support %(target)s instead of standard output, if
                  necessary?

        :param str source: path to the source file
        :param str target: path to the output file
        :param dict data: remaining arguments passed as is to
                          :func:`JinjaRenderer.render`
        :raises CommandRendererError: if command fails to convert given page
        '''
        # w3m parser requires absolute paths
        self.source = os.path.abspath(source)
        # build a safe command using the shell lexer: the template is
        # split first and ``%(source)s`` expanded in each token after,
        # so a path containing spaces stays a single argument
        command = [token % {'source': self.source}
                   for token in shlex.split(self.command)]
        if self.uptodate(target):
            logger.debug('%s is up to date, not running %s', target, command)
            return
        if self.dryrun:
            logger.debug('dryrun: not running command %s', command)
            data['content'] = ''
        else:
            logger.debug('running command %s', command)
            try:
                content = subprocess.check_output(command,
                                                  stderr=subprocess.PIPE)
            except subprocess.CalledProcessError as e:
                raise CommandRendererError('%s failed to convert %s: %s'
                                           % (self.command, self.source, e))
            # undecodable bytes are replaced instead of failing the page
            data['content'] = content.decode('utf-8', 'replace')
        self.postprocess(data)
        super(CommandRenderer, self).render(target, **data)
class CommandRendererError(RuntimeError):
    '''raised when a renderer command fails to convert a page

    besides failing external commands in general, this is also raised
    when man2html reports an error status in its output.
    '''
class W3mRenderer(CommandRenderer, ManpageRenderer):
    '''render manpages with w3m'''

    #: path to w3m converter
    command = '/usr/lib/w3m/cgi-bin/w3mman2html.cgi "quit=1&local=%(source)s"'

    def postprocess(self, data):
        '''process w3m parser output

        - rewrite ``file:///`` cross-reference links to relative
          ``../manN/page.N.html`` links
        - keep only the inside of the ``<body>`` tag
        - set ``title`` (and ``description``, if any) from the
          ``NAME`` section
        - turn bold section headers into ``<h2>`` elements

        :param dict data: template parameters, ``content`` is replaced
                          in place
        '''
        content = data['content']
        content = re.sub(r'<a href="file:///[^?]*\?([^(]*)\(([^)]*)\)">',
                         r'<a href="../man\2/\1.\2.html">', content)
        # copy-pasted from Man2htmlRenderer
        content = re.sub(r'^.*<body>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'</body>.*$', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        # find the page title
        match = re.search(r'^<b>NAME</b>\s*^\s+(.*?)^<b>', content,
                          re.DOTALL | re.MULTILINE)
        if match:
            name = match.group(1).strip()
            logger.debug('found page name %s', name)
            # "name - description": split on the dash surrounded by
            # whitespace so hyphenated names (``apt-get``) stay whole
            parts = re.split(r'\s+-+\s+', name, maxsplit=1)
            if len(parts) == 1:
                # no spaced dash: fall back to the first dash, if any
                parts = name.split('-', 1)
            data['title'] = parts[0].strip()
            if len(parts) == 2:
                data['description'] = parts[1].strip()
        # fix broken headings: merge two adjacent bold words into a
        # single bold span. the newline is kept so the merged heading
        # is still picked up by the header substitution below.
        content = re.sub(r'\n<b>([^<]*)</b> +<b>', r'\n<b>\1 ', content)
        # turn section headers into H2. flags must be given by keyword:
        # the fourth positional argument of re.sub() is ``count``.
        content = re.sub(r'\n<b>([\w ]+)</b>', r'\n<h2>\1</h2>',
                         content, flags=re.UNICODE)
        data['content'] = content
class MandocRenderer(CommandRenderer, ManpageRenderer):
    '''render pages with mandoc

    the doubled percent signs in :attr:`command` survive the
    ``%(source)s`` expansion as single ones, leaving ``%S`` and ``%N``
    in the ``man=`` link template handed to mandoc.

    .. todo:: this assumes cross-references are done with the ``.Xr``
              macro, which is unfortunately not often the case in my
              tests. so some manual cross-ref will be required here.

    .. todo:: croaks on the ``kodi(1)`` manpage, a weird redirect,
              which we should handle manually here. the fix, according
              to ``mandoc(1)`` is to chdir to the correct relative
              directory. looking at ``zshall(1)``, ``.so`` looks like
              an "include" directive.
    '''

    #: mandoc invocation producing an HTML fragment on standard output
    command = ('mandoc -T html'
               ' -Ofragment,man=../man%%S/%%N.%%S.html'
               ' %(source)s')
class Man2htmlRenderer(CommandRenderer, ManpageRenderer):
    '''render manpages with man2html'''

    command = 'man2html %(source)s'

    def postprocess(self, data):
        '''process man2html output

        - it doesn't return proper exit codes, look for Status header
          instead. Anything 40X is bad.
        - the title is in the ``NAME`` level two header (``<h2>``)
        - keep only the inside of the ``<body>`` tag
        - rewrite URLs to point to the right place
        - remove attribution

        :param dict data: template parameters, modified in place
        :return: the same ``data`` dictionary
        :raises CommandRendererError: if the output starts with a 40X
                                      ``Status`` header
        '''
        content = data['content']
        if content.startswith('Status: 40'):
            # report the page <title> as the error, falling back to the
            # whole output when there is none
            found = re.search(r'<title>(.*?)</title>', content,
                              flags=re.DOTALL | re.IGNORECASE)
            error = found.group(1) if found else content
            raise CommandRendererError('%s failed to convert %s: %s'
                                       % (self.command, self.source, error))
        # optional whitespace (not word characters) may surround the
        # heading text and precede the page name
        match = re.search(r'<h2>\s*NAME\s*</h2>\s*([^<]*)<', content,
                          re.DOTALL | re.IGNORECASE)
        if match:
            name = match.group(1).strip()
            logger.debug('found page name %s', name)
            # "name - description": split on the dash surrounded by
            # whitespace so hyphenated names (``apt-get``) stay whole
            parts = re.split(r'\s+-+\s+', name, maxsplit=1)
            if len(parts) == 1:
                # no spaced dash: fall back to the first dash, if any
                parts = name.split('-', 1)
            data['title'] = parts[0].strip()
            if len(parts) == 2:
                data['description'] = parts[1].strip()
        content = re.sub(r'^.*<body>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'</body>.*$', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'/cgi-bin/man/man2html\?([1-9])\+(\w+)',
                         r'/man/man\1/\2.\1.html', content)
        content = re.sub(r'<HR>\s+This document was created by\s+<A HREF="/cgi-bin/man/man2html">.*$',
                         '', content, flags=re.IGNORECASE | re.DOTALL)
        data['content'] = content
        return data
#: quick switch to toggle default manpage rendering implementation:
#: an alias for the renderer class to use, currently the mandoc-based
#: one. point it at another renderer class of this module (for example
#: W3mRenderer or Man2htmlRenderer) to change implementation.
DefaultManpageRenderer = MandocRenderer
def find_files(directory, patterns):
    '''look for file patterns in the given directory and return the right
    command to run

    every file found below ``directory`` is checked against every
    pattern, so a path matching several patterns is yielded once per
    matching pattern.

    .. todo:: this may be slow in large directories and may be
              reimplemented with :func:`os.scandir` if we ever depend on
              Python 3.5 or later.

    :param str directory: directory to walk recursively
    :param dict patterns: mapping of regular expressions to the
                          ``module`` to use for matching paths
    :return: generator of ``module``, ``path``, ``match`` tuples,
             where ``match`` is the regex match object
    :rtype: generator
    '''
    for root, dirs, files in os.walk(directory):
        logging.debug('walking: %s %s %s', root, dirs, files)
        for name in files:
            path = os.path.join(root, name)
            # items() exists on both Python 2 and 3, iteritems() does not
            for regex, module in patterns.items():
                m = re.search(regex, path)
                if m:
                    yield module, path, m
def match_jobs(files, patterns):
    '''dispatch the right command for the matching pattern

    a path matching several patterns is yielded once per matching
    pattern; paths matching none are skipped.

    :param list files: list of file paths to inspect
    :param dict patterns: mapping of ``regex`` to ``module``.
                          ``regex`` is a regular expression to match
                          against the pathnames, ``module`` is the
                          renderer to run for matching paths
    :return: generator of ``module``, ``path``, ``match`` tuples,
             where ``match`` is the regex match object
    :rtype: generator
    '''
    for path in files:
        # items() exists on both Python 2 and 3, iteritems() does not
        for regex, module in patterns.items():
            m = re.search(regex, path)
            if m:
                yield module, path, m
@click.command()
@click.pass_obj
def site(obj):
    '''render the whole static site'''
    # NOTE: the docstring above is shown by click as the command help,
    # so implementation notes live in comments instead.
    #
    # two passes: render every ``*.mdwn`` file of the static directory
    # into the output directory, then copy the stylesheets, scripts
    # and pictures next to them.
    suites = obj['mirror'].releases
    static_dir = find_static_file('static')
    pattern = os.path.join(static_dir, '*.mdwn')
    template = find_static_file(os.path.join('templates', 'template.html'))
    logging.info('rendering files in %s with template %s',
                 pattern, template)
    # the renderer does not depend on the file: build it only once
    renderer = MarkdownRenderer(template, dryrun=obj['dryrun'],
                                cache=obj['cache'])
    rendered = 0
    started = time.time()
    for path in glob.glob(pattern):
        # page.mdwn -> <output>/page.html
        name = re.sub(r'\.mdwn$', '.html', os.path.basename(path))
        target = os.path.join(obj['output'], name)
        renderer.render(path, target, prefix=obj['prefix'], suites=suites)
        rendered += 1
    logging.info('rendered %d files in %s', rendered,
                 naturaldelta(time.time() - started))

    logging.info('copying files')
    # NOTE(review): files are copied even when obj['dryrun'] is set;
    # confirm whether that is intended.
    copied = 0
    started = time.time()
    for pattern in '*.css', '*.js':
        for static in glob.glob(os.path.join(static_dir, pattern)):
            copied += 1
            logging.debug('copying %s to %s', static, obj['output'])
            shutil.copy(static, obj['output'])
    picsdir = os.path.join(static_dir, 'Pics')
    pics = os.path.join(picsdir, '*')
    targetdir = os.path.join(obj['output'], 'Pics')
    mkdirp(targetdir)
    logging.debug('copying pics %s', pics)
    for static in glob.glob(pics):
        copied += 1
        logging.debug('copying %s to %s', static, targetdir)
        shutil.copy(static, targetdir)
    logging.info('copied %d files in %s', copied,
                 naturaldelta(time.time() - started))