Source code for debmans.renderer

#!/usr/bin/python3
# coding: utf-8

# Copyright (C) 2016 Antoine Beaupré <anarcat@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import division, absolute_import
from __future__ import print_function

import glob
import os
import re
import shlex
import shutil
import subprocess
import time

import logging
logger = logging.getLogger(__name__)

import click
from jinja2 import Template
from markdown import markdown
# workaround missing import issue in 0.5
from humanize.time import naturaldelta

from debmans.utils import mkdirp, find_static_file


@click.command()
@click.option('-s', '--srcdir', default='.',
              help='where the source manuals are (default: output of the extract command)')
@click.pass_obj
def render(obj, srcdir):
    '''render documentation to HTML

    this looks for patterns matching a certain regex in the given
    SRCDIR directory

    .. note:: this assumes files have an extension that should be
              stripped. for manpages, this should generally be
              ``.gz``. if manpages are not compressed, this will break
              section support.

    .. todo:: document that compressed manpages are mandatory
    '''
    # NOTE: the docstring above doubles as the click help text, so it
    # is left untouched; implementation notes live in comments.
    output = obj['output']
    mirror = obj['mirror']
    patterns = obj['patterns']

    # get modified paths from extractor, if it was also called
    if 'changed_paths' in obj:
        if srcdir:
            # logging.warn() is a deprecated alias of warning(); also
            # go through the module logger like the rest of the file
            logger.warning('ignoring --srcdir parameter because called with extractor')
            srcdir = None
        logger.info('received %d paths from extractor',
                    len(obj['changed_paths']))
        filelist = match_jobs(obj['changed_paths'], patterns)
    else:
        logger.info('looking for patterns %s in %s', patterns, srcdir)
        filelist = find_files(srcdir, patterns)

    if obj['progress']:
        progress = click.progressbar
    else:
        class fake_progress:
            '''stand-in for click.progressbar that just hands back the
            iterable and displays nothing'''

            def __init__(self, it, label=None):
                self.it = it

            def __enter__(self):
                return self.it

            def __exit__(self, type, value, traceback):
                pass
        progress = fake_progress

    rendered = 0
    start = time.time()

    # the job generator is materialized so the progress bar knows its
    # length
    with progress(list(filelist),
                  label='rendering manpages') as bar:
        for renderer, source, _match in bar:
            if srcdir:
                # from srcdir, replace it with output dir
                target = source.replace(srcdir, output, 1)
            else:
                # from extractor: already in the output dir
                target = source
            # replace .gz extension with .html
            base, _ext = os.path.splitext(target)
            target = os.path.abspath(base + '.html')
            try:
                renderer.render(source, target,
                                prefix=obj['prefix'],
                                suites=mirror.releases)
            except CommandRendererError as e:
                logger.warning('%s failed to convert %s: %s',
                               renderer, source, e)
            else:
                # only successful conversions count as "rendered"
                rendered += 1
    logger.info('rendered %d manpages in %s', rendered,
                naturaldelta(time.time() - start))


class JinjaRenderer(object):
    '''render Jinja templates using given parameters, caching and
    simulation

    this is basically an extension of the Template class, but extended
    so we can easily pass paths (instead of strings) in and out.

    .. todo:: we should probably have derived Template directly here.
    '''

    def __init__(self, template, cache=True, dryrun=False):
        '''create a renderer

        :param str template: path to the Jinja2 template
        :param bool cache: do not overwrite output file if newer
        :param bool dryrun: do not write anything in any case, useful
                            to test cache detection
        '''
        #: template to use to render the data
        self.template = template
        #: if we should check timestamps before writing
        self.cache = cache
        #: if True, do not write
        self.dryrun = dryrun
        #: source file currently processed
        self.source = None

    def generated_time(self):
        '''handy function to add timestamp to footers

        :return: a ``Generated on ...`` string built from the current
                 local time
        :rtype: str
        '''
        return 'Generated on %s' % time.strftime('%Y-%m-%d %H:%M:%S%Z')

    def render(self, target, **data):
        '''render template with given data

        if ``pageinfo`` isn't provided in :data:`data`, it is set to
        the output of :func:`generated_time`.

        nothing is written if the target is up to date (see
        :meth:`uptodate`) or if :attr:`dryrun` is set.

        :param str target: path to the target file
        :param dict data: set of parameters passed to :meth:`~jinja2.Template.render`

        '''
        if 'pageinfo' not in data:
            data['pageinfo'] = self.generated_time()
        if self.uptodate(target):
            logger.debug('%s is up to date with template %s',
                         target, self.template)
            return
        if self.dryrun:
            logger.debug('dryrun: not writing file %s with template %s',
                         target, self.template)
            return
        logger.debug('writing file %s with template %s',
                     target, self.template)
        # binary mode plus explicit UTF-8 transcoding behaves the same
        # on Python 2 and 3; text mode with .decode()/.encode() only
        # works on Python 2
        with open(self.template, 'rb') as templatefile:
            tmpl = Template(templatefile.read().decode('utf-8'))
        # render *before* opening the target: opening it for writing
        # truncates it, and a failed render would otherwise leave an
        # empty but fresh file that uptodate() then trusts
        output = tmpl.render(data).encode('utf-8')
        mkdirp(os.path.dirname(target))
        with open(target, 'wb') as targetfile:
            targetfile.write(output)

    def uptodate(self, target):
        '''check if the target file is newer than template

        also checks the :attr:`source` attribute if it is set, which
        allows for subclasses to add a file to check.

        :param str target: path to the generated file
        :return: True only if caching is enabled, the target exists
                 and it is at least as recent as the template and, if
                 set, the source
        :rtype: bool
        '''
        if not (self.cache and os.path.exists(target)):
            return False
        target_mtime = os.stat(target).st_mtime
        if os.stat(self.template).st_mtime > target_mtime:
            return False
        return (self.source is None
                or os.stat(self.source).st_mtime <= target_mtime)


class MarkdownRenderer(JinjaRenderer):
    '''render markdown source files with a jinja template'''

    def render(self, source, target, **data):
        '''render the given source file

        the source is converted to HTML and handed to the template as
        the ``content`` variable.

        :param str source: path to the Markdown source file, read as
                           UTF-8
        :param str target: passed to :func:`JinjaRenderer.render`
        :param dict data: extra template parameters, passed as is to
                          :func:`JinjaRenderer.render`
        '''
        # recorded so uptodate() also compares against the source
        self.source = source
        # markdown() wants Unicode text: decode explicitly instead of
        # relying on the platform default, consistent with how the
        # template itself is read
        with open(source, 'rb') as sourcefile:
            text = sourcefile.read().decode('utf-8')
        # the source file is closed before the (possibly slow) render
        html = markdown(text)
        super(MarkdownRenderer, self).render(target, content=html, **data)


class CommandRenderer(JinjaRenderer):
    '''a simple template-based rendering system

    a file is passed as an argument to a command and the output is
    written into the given template, in the `{{content}}` Jinja2
    element.

    this is meant to be subclassed in command-specific renderers.

    those can also not even be command-based, as long as they have the
    following parameters:

    - ``pattern``: regular expression pattern for this class

    - ``render(source, target, **data)``: render the given source file
      into the target file, with the attached Jinja data. at least
      ``content`` is expected in there, but ``description`` and
      ``title`` are also encouraged, those should match the template.

    '''
    def __init__(self, template, command=None, cache=True, dryrun=False):
        '''create a renderer with the given command

        :param str command: command to launch for this renderer. it is
                            split with :func:`shlex.split` and each
                            word is ``%``-formatted with ``source``,
                            so literal percent signs must be doubled.
                            if None, the ``command`` class attribute
                            of the subclass is used.

        Other parameters passed as is to
        :func:`JinjaRenderer.__init__`
        '''
        if command is not None:
            self.command = command
        super(CommandRenderer, self).__init__(template, cache, dryrun)

    def postprocess(self, data):
        '''modify the data sent to the template after execution

        this allows subclasses to intervene between the command call
        and the render call.

        by default does nothing

        :param dict data: template parameters, to be modified in place
        '''

    def render(self, source, target, **data):
        '''render the given source file using external command defined in
        constructor

        does not call command in :attr:`dryrun` mode.

        .. todo:: support %(target)s instead of standard output, if
                  necessary?

        :param str source: path to the source file
        :param str target: path to the output file
        :param dict data: remaining arguments passed as is to
                          :func:`JinjaRenderer.render`
        :raises CommandRendererError: if command fails to convert given page

        '''
        # w3m parser requires absolute paths
        self.source = os.path.abspath(source)
        # build a safe command using the shell lexer: the template is
        # split *before* the path is substituted, so a path containing
        # spaces or quotes stays a single argument
        command = [x % {'source': self.source} for x in shlex.split(self.command)]
        if self.uptodate(target):
            logger.debug('%s is up to date, not running %s', target, command)
            return
        if self.dryrun:
            logger.debug('dryrun: not running command %s', command)
            data['content'] = ''
        else:
            logger.debug('running command %s', command)
            try:
                content = subprocess.check_output(command,
                                                  stderr=subprocess.PIPE)
            except subprocess.CalledProcessError as e:
                reason = '%s' % e
                # the captured stderr is only attached to the
                # exception on Python 3; report it when available
                # instead of silently dropping the diagnostics
                stderr = getattr(e, 'stderr', None)
                if stderr:
                    reason = '%s: %s' % (
                        reason, stderr.decode('utf-8', 'replace').strip())
                raise CommandRendererError('%s failed to convert %s: %s'
                                           % (self.command, self.source,
                                              reason))
            data['content'] = content.decode('utf-8', 'replace')
        self.postprocess(data)
        super(CommandRenderer, self).render(target, **data)


class CommandRendererError(RuntimeError):
    '''error raised when a renderer fails to convert a page

    raised when the external command exits with an error, and by
    postprocessing hooks that detect a failure in the command output.
    '''


class ManpageRenderer(object):
    '''abstract class to store the manpage regex pattern

    the pattern is meant to be searched in a file path and provides
    the following named groups: ``suite`` (optional leading directory),
    ``path`` (from ``man/`` to the end), ``locale`` (optional
    directory between ``man/`` and ``manN/``), ``name`` and
    ``section``.

    only compressed (``.gz``) manpages match.
    '''

    #: default pattern for manpages
    #
    # the ``path`` group is mandatory: it used to be followed by a
    # stray ``?`` which made the whole manpage path optional, so any
    # path ending with a slash matched with every group set to None
    pattern = r'/(?:(?P<suite>\w+)/)?(?P<path>man/(?:(?P<locale>\w+)/)?man[1-9]/(?P<name>.+)\.(?P<section>[1-9]\w*)\.gz)$'


class W3mRenderer(CommandRenderer, ManpageRenderer):
    '''render manpages with w3m'''

    #: path to w3m converter
    command = '/usr/lib/w3m/cgi-bin/w3mman2html.cgi "quit=1&local=%(source)s"'

    def postprocess(self, data):
        '''process w3m parser output

        - rewrite cross-reference links to point to the rendered pages
        - keep only the inside of the ``<body>`` tag
        - extract ``title`` and ``description`` from the ``NAME``
          section
        - turn the bold section names into ``<h2>`` headings

        :param dict data: template parameters, modified in place;
                          ``content`` is read and replaced, ``title``
                          and ``description`` are set when found
        '''
        content = data['content']
        content = re.sub(r'<a href="file:///[^?]*\?([^(]*)\(([^)]*)\)">',
                         r'<a href="../man\2/\1.\2.html">', content)
        # copy-pasted from Man2htmlRenderer
        content = re.sub(r'^.*<body>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'</body>.*$', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        # find the page title
        match = re.search(r'^<b>NAME</b>\s*^\s+(.*?)^<b>', content,
                          re.DOTALL | re.MULTILINE)
        if match:
            name = match.group(1).strip()
            logger.debug('found page name %s', name)
            # the NAME line reads "title - description": split on the
            # first dash surrounded by whitespace so that hyphenated
            # names (``apt-get``) and descriptions are kept whole
            parts = re.split(r'\s+-+\s+', name, 1)
            data['title'] = parts[0].strip()
            if len(parts) > 1:
                data['description'] = parts[1].strip()
        # fix broken headings
        content = re.sub(r'\n<b>([^<]*)</b> +<b>', r'<b>\1 ', content)
        # turn section headers into H2
        #
        # flags must be passed by keyword: the fourth positional
        # argument of re.sub() is ``count``, so passing re.UNICODE
        # there silently capped the number of replacements
        content = re.sub(r'\n<b>([\w ]+)</b>', r'\n<h2>\1</h2>',
                         content, flags=re.UNICODE)
        data['content'] = content


class MandocRenderer(CommandRenderer, ManpageRenderer):
    '''render pages with mandoc

    .. todo:: this assumes cross-references are done with the ``.Xr``
              macro, which is unfortunately not often the case in my
              tests. so some manual cross-ref will be required here.

    .. todo:: croaks on the ``kodi(1)`` manpage, a weird redirect,
              which we should handle manually here. the fix, according
              to ``mandoc(1)`` is to chdir to the correct relative
              directory. looking at ``zshall(1)``, ``.so`` looks like
              an "include" directive.
    '''

    #: mandoc command line. percent signs are doubled because each
    #: word goes through ``%`` formatting before being executed, so
    #: mandoc receives ``%S`` and ``%N`` in the ``man=`` link template
    command = 'mandoc -T html -Ofragment,man=../man%%S/%%N.%%S.html %(source)s'


class Man2htmlRenderer(CommandRenderer, ManpageRenderer):
    '''render manpages with man2html'''
    command = 'man2html %(source)s'

    def postprocess(self, data):
        '''process man2html output

        - it doesn't return proper exit codes, look for Status header
          instead. Anything 40X is bad.
        - the title is in the ``NAME`` level two header (``<h2>``)
        - keep only the inside of the ``<body>`` tag
        - rewrite URLs to point to the right place
        - remove attribution

        :param dict data: template parameters, modified in place;
                          ``content`` is read and replaced, ``title``
                          and ``description`` are set when found
        :return: the modified ``data``
        :rtype: dict
        :raises CommandRendererError: if the output starts with a
                                      ``Status: 40X`` header
        '''
        content = data['content']
        if content.startswith('Status: 40'):
            # flags must be passed by keyword: the fourth positional
            # argument of re.sub() is ``count``, so the flags were
            # ignored and the title was never extracted from a
            # multi-line error page
            e = re.sub(r'^.*<title>(.*)</title>.*$', r'\1',
                       content, flags=re.DOTALL | re.IGNORECASE)
            raise CommandRendererError('%s failed to convert %s: %s'
                                       % (self.command, self.source, e))
        # whitespace (not word characters) is what may surround the
        # heading text and precede the page name
        match = re.search(r'<h2>\s*NAME\s*</h2>\s*([^<]*)<', content,
                          re.DOTALL | re.IGNORECASE)
        if match:
            name = match.group(1).strip()
            logger.debug('found page name %s', name)
            # the NAME line reads "title - description": split on the
            # first dash surrounded by whitespace so that hyphenated
            # names (``apt-get``) and descriptions are kept whole
            parts = re.split(r'\s+-+\s+', name, 1)
            data['title'] = parts[0].strip()
            if len(parts) > 1:
                data['description'] = parts[1].strip()
        content = re.sub(r'^.*<body>', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'</body>.*$', '', content,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'/cgi-bin/man/man2html\?([1-9])\+(\w+)',
                         r'/man/man\1/\2.\1.html', content)
        content = re.sub(r'<HR>\s+This document was created by\s+<A HREF="/cgi-bin/man/man2html">.*$',
                         '', content, flags=re.IGNORECASE | re.DOTALL)
        data['content'] = content
        return data


#: quick switch to toggle default manpage rendering implementation;
#: currently an alias for :class:`MandocRenderer`, the other manpage
#: renderers defined above being :class:`W3mRenderer` and
#: :class:`Man2htmlRenderer`
DefaultManpageRenderer = MandocRenderer


def find_files(directory, patterns):
    '''look for file patterns in the given directory and return the right
    command to run

    every file found under ``directory`` is checked against every
    pattern, so a file matching several patterns is yielded once per
    matching pattern.

    .. todo:: this may be slow in large directories and may be
              reimplemented with :func:`os.scandir` if we ever depend on
              Python 3.5 or later.

    :param str directory: directory to walk recursively
    :param dict patterns: maps a regular expression to the object
                          (``module``) associated with it
    :return: generator of (``module``, ``path``, ``match``) tuples,
             ``match`` being the match object of the regex searched
             in ``path``
    :rtype: generator
    '''
    for root, dirs, files in os.walk(directory):
        logging.debug('walking: %s %s %s', root, dirs, files)
        for name in files:
            path = os.path.join(root, name)
            # items() exists on both Python 2 and 3, iteritems() only
            # on Python 2
            for regex, module in patterns.items():
                m = re.search(regex, path)
                if m:
                    yield module, path, m


def match_jobs(files, patterns):
    '''dispatch the right command for the matching pattern

    every path is checked against every pattern, so a path matching
    several patterns is yielded once per matching pattern.

    :param list files: list of file paths to inspect
    :param dict patterns: maps a regular expression, searched in the
                          pathnames, to the object (``module``)
                          associated with it
    :return: generator of (``module``, ``path``, ``match``) tuples,
             ``match`` being the match object of the regex searched
             in ``path``
    :rtype: generator
    '''
    for path in files:
        # items() exists on both Python 2 and 3, iteritems() only on
        # Python 2
        for regex, module in patterns.items():
            m = re.search(regex, path)
            if m:
                yield module, path, m


@click.command()
@click.pass_obj
def site(obj):
    '''render the whole static site'''
    # NOTE: the docstring above doubles as the click help text, so it
    # is left untouched; implementation notes live in comments.
    #
    # two phases: render every ``*.mdwn`` file of the static directory
    # through the site template, then copy stylesheets, scripts and
    # pictures to the output directory.
    suites = obj['mirror'].releases
    output = obj['output']
    dryrun = obj['dryrun']
    static_dir = find_static_file('static')
    pattern = os.path.join(static_dir, '*.mdwn')
    template = find_static_file(os.path.join('templates', 'template.html'))
    logger.info('rendering files in %s with template %s',
                pattern, template)
    count = 0
    start = time.time()
    # one renderer is enough: render() resets the source on each call
    renderer = MarkdownRenderer(template, dryrun=dryrun,
                                cache=obj['cache'])
    for path in glob.glob(pattern):
        # pages are flattened into the top of the output directory
        name = re.sub(r'\.mdwn$', '.html', os.path.basename(path))
        target = os.path.join(output, name)
        renderer.render(path, target, prefix=obj['prefix'], suites=suites)
        count += 1
    logger.info('rendered %d files in %s', count,
                naturaldelta(time.time() - start))

    if dryrun:
        # dryrun means nothing gets written, static files included
        logger.info('dryrun: not copying static files')
        return

    logger.info('copying files')
    count = 0
    start = time.time()
    # make sure the destination is a directory: shutil.copy() to a
    # missing path would create a *file* with that name instead
    mkdirp(output)
    for pattern in '*.css', '*.js':
        for static in glob.glob(os.path.join(static_dir, pattern)):
            count += 1
            logger.debug('copying %s to %s', static, output)
            shutil.copy(static, output)
    pics = os.path.join(static_dir, 'Pics', '*')
    targetdir = os.path.join(output, 'Pics')
    mkdirp(targetdir)
    logger.debug('copying pics %s', pics)
    for static in glob.glob(pics):
        count += 1
        logger.debug('copying %s to %s', static, targetdir)
        shutil.copy(static, targetdir)
    logger.info('copied %d files in %s', count,
                naturaldelta(time.time() - start))