gtk2/docs/reference/gtk/gtk-markdown-to-docbook

#!/usr/bin/python3
#
# Call pandoc to convert markdown to docbook, then expand gtk-doc
# abbreviations (|[ ]|, function(), #object, %constant, etc)

import sys
import re
import tempfile
import os.path
import subprocess

# The following code is taken from gtk-doc

def ExpandAbbreviations(symbol, text):
    """Expand gtk-doc abbreviations in *text* into DocBook markup.

    The conversions are applied in this order: '@param()', 'function()' /
    'macro()', '#Object.func()', '@param', '%constant' and '#symbol'.  A
    backslash directly in front of '@', '%' or '#' prevents the conversion,
    and the backslash itself is removed from the result.

    Args:
        symbol (str): not used by this function.
        text (str): the text to expand.

    Returns:
        str: the expanded text.
    """
    # Convert '@param()'
    text = re.sub(r'(\A|[^\\])\@(\w+((\.|->)\w+)*)\s*\(\)', r'\1<parameter>\2()</parameter>', text)

    # Convert 'function()' or 'macro()'.
    # if there is abc_*_def() we don't want to make a link to _def()
    # FIXME: also handle abc(def(....)) : but that would need to be done recursively :/
    def function_xref(m):
        name = m.group(2)
        return m.group(1) + MakeXRef(name, tagify(name + "()", "function"))

    text = re.sub(r'([^\*.\w])(\w+)\s*\(\)', function_xref, text)
    # handle #Object.func()
    text = re.sub(r'(\A|[^\\])#([\w\-:\.]+[\w]+)\s*\(\)', function_xref, text)

    # Convert '@param', but not '\@param'.
    text = re.sub(r'(\A|[^\\])\@(\w+((\.|->)\w+)*)', r'\1<parameter>\2</parameter>', text)
    # Drop the escaping backslash.  The replacement must be a plain '@':
    # re.sub() leaves an unknown escape such as '\@' in the replacement
    # untouched, which would put the backslash straight back.
    text = re.sub(r'\\@', '@', text)

    # Convert '%constant', but not '\%constant'.
    # Also allow negative numbers, e.g. %-1.
    def constant_xref(m):
        name = m.group(2)
        return m.group(1) + MakeXRef(name, tagify(name, "literal"))

    text = re.sub(r'(\A|[^\\])\%(-?\w+)', constant_xref, text)
    # Same as for '@': the replacement is a plain '%' so the backslash goes away.
    text = re.sub(r'\\%', '%', text)

    # Convert '#symbol', but not '\#symbol'.

    # Only convert #foo after a space to avoid interfering with
    # fragment identifiers in urls
    def type_xref(m):
        return m.group(1) + MakeHashXRef(m.group(2), "type")

    text = re.sub(r'(\A|[ ])#([\w\-:\.]+[\w]+)', type_xref, text)
    text = re.sub(r'\\#', '#', text)

    return text

# Names of the standard C preprocessor directives.  A '#' followed by one of
# these is not treated as a '#' abbreviation.
PreProcessorDirectives = {
    'assert',
    'define',
    'elif',
    'else',
    'endif',
    'error',
    'if',
    'ifdef',
    'ifndef',
    'include',
    'line',
    'pragma',
    'unassert',
    'undef',
    'warning',
}

def MakeHashXRef(symbol, tag):
    """Build the cross-reference for a '#symbol' abbreviation.

    Args:
        symbol (str): the name that followed the '#'.
        tag (str): element to wrap the link text in, or '' for no wrapping.

    Returns:
        str: '#' followed by the symbol for C preprocessor directives,
        otherwise the result of MakeXRef() for the rewritten symbol and text.
    """
    # Things like '#include' and '#define' are not references; hand them back.
    if symbol in PreProcessorDirectives:
        return "#%s" % symbol

    # The link text does not carry the special '-struct' / '-enum' suffixes.
    label = symbol
    for suffix in (r'-struct$', r'-enum$'):
        label = re.sub(suffix, '', label)

    # "Object::signal" becomes the symbol "Object-signal", with the quoted
    # signal name as the text.
    if '::' in symbol:
        owner, _, signal = symbol.partition('::')
        symbol = '%s-%s' % (owner, signal)
        label = u'“' + signal + u'”'

    # "Object:property" becomes the symbol "Object--property", with the
    # quoted property name as the text.  This check runs on the symbol as
    # left by the signal case above.
    if ':' in symbol:
        owner, _, prop = symbol.partition(':')
        symbol = '%s--%s' % (owner, prop)
        label = u'“' + prop + u'”'

    if tag != '':
        label = tagify(label, tag)

    return MakeXRef(symbol, label)

def MakeXRef(symbol, text=None):
    """Return a DocBook cross-reference link to *symbol*.

    Surrounding whitespace is stripped from the symbol first.  A symbol that
    still contains a space is not linked; only the text is returned.

    Args:
        symbol (str): the symbol to link to.
        text (str): text to show inside the link.  When empty or omitted, the
            symbol without a trailing '-struct' / '-enum' is used.

    Returns:
        str: a '<link linkend="...">' element, or the bare text.
    """
    target = symbol.strip()

    label = text
    if not label:
        # Fall back to the symbol itself, minus the special suffixes.
        label = target
        for suffix in (r'-struct$', r'-enum$'):
            label = re.sub(suffix, '', label)

    if ' ' in target:
        return label

    return "<link linkend=\"%s\">%s</link>" % (CreateValidSGMLID(target), label)

def CreateValidSGMLID(xml_id):
    """Turn the given string into a valid SGML 'id'.

    According to http://www.w3.org/TR/html4/types.html#type-id "ID and NAME
    tokens must begin with a letter ([A-Za-z]) and may be followed by any number
    of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"),
    and periods (".")."

    SGML ids are case-insensitive, so ":CAPS" is appended to all-caps
    identifiers to keep them from clashing with a lower-case spelling.

    Args:
      xml_id (str): The text to be converted into a valid SGML id.

    Returns:
      str: The converted id.
    """
    # '_' alone would be reduced to '', so it gets a fixed id instead.
    if xml_id == '_':
        return "gettext-macro"

    # One pass: drop ',' and ';', turn '_' and ' ' into '-'.
    char_map = str.maketrans({',': None, ';': None, '_': '-', ' ': '-'})
    result = xml_id.translate(char_map)

    # No leading hyphens; '::' collapses to '-', a remaining ':' becomes '--'.
    result = result.lstrip('-').replace('::', '-').replace(':', '--')

    # Append ":CAPS" to all all-caps identifiers
    # FIXME: there are some inconsistencies here, we have index files containing e.g. TRUE--CAPS
    if not result.isupper() or result.endswith('-CAPS'):
        return result
    return result + ':CAPS'

def tagify(text, elem):
    """Wrap *text* in an opening and closing *elem* tag.

    e.g. tagify("Text", "literal") => "<literal>Text</literal>".
    """
    opening = '<' + elem + '>'
    closing = '</' + elem + '>'
    return ''.join((opening, text, closing))

# End of gtk-doc excerpts

MarkdownExtensions = {
    # Switched off.
    '-auto_identifiers',          # explicit identifiers are used where needed
    '-raw_html',                  # escapes literal tags like <child> in input

    # Switched on.
    '+header_attributes',         # carries the explicit identifiers
    '+blank_before_header',       # helps with gtk-doc #Object abbreviations
    '+compact_definition_lists',  # replacement for <variablelist>
    '+pipe_tables',
    '+backtick_code_blocks',      # replacement for |[ ]|
    '+fenced_code_attributes',    # adds language annotations
    '+startnum',                  # interrupted lists in the q&a part
}

def ConvertToDocbook(infile, outfile):
    """Run pandoc to convert the markdown file *infile* to DocBook.

    The top-level division is 'section' when the base name of *infile* starts
    with 'section', and 'chapter' otherwise.

    Args:
        infile (str): path of the markdown input.
        outfile (str): path pandoc writes the DocBook output to.

    Raises:
        subprocess.CalledProcessError: if pandoc exits with a non-zero status.
    """
    basename = os.path.basename(infile)
    division = 'section' if basename.startswith('section') else 'chapter'

    # MarkdownExtensions is a set, whose iteration order is not stable
    # between runs.  Sorting keeps the pandoc command line identical from
    # one invocation to the next.
    input_format = "markdown" + "".join(sorted(MarkdownExtensions))
    output_format = "docbook"
    subprocess.check_call(["pandoc", infile, "-o", outfile,
                           "--from=" + input_format,
                           "--to=" + output_format,
                           "--standalone",
                           "--top-level-division=" + division])

def ExpandGtkDocAbbreviations(infile, outfile):
    """Expand the gtk-doc abbreviations in *infile* and write to *outfile*.

    Both files are read and written as UTF-8.

    Args:
        infile (str): path of the file to read.
        outfile (str): path of the file to write the expanded text to.
    """
    # Read inside a 'with' block so the input handle is closed right away.
    with open(infile, 'r', encoding='utf-8') as src:
        contents = src.read()

    # Expand before opening the output, so a failure during expansion does
    # not leave a truncated output file behind.
    expanded = ExpandAbbreviations("file", contents)

    with open(outfile, 'w', encoding='utf-8') as out:
        out.write(expanded)


if __name__ == '__main__':
    # Usage: <script> INFILE OUTFILE
    if len(sys.argv) < 3:
        sys.exit("usage: %s INFILE OUTFILE" % sys.argv[0])

    # mkstemp() creates the file itself, unlike the deprecated mktemp(),
    # which only returns a name that another process could claim first.
    # pandoc reopens the file by path, so the descriptor is closed at once.
    fd, tmp = tempfile.mkstemp()
    os.close(fd)
    try:
        ConvertToDocbook(sys.argv[1], tmp)
        ExpandGtkDocAbbreviations(tmp, sys.argv[2])
    finally:
        # Remove the intermediate file even when a step above fails.
        os.remove(tmp)