OpenSubdiv/documentation/processHtml.py
2013-07-18 15:05:24 -07:00

197 lines
6.0 KiB
Python
Executable File

#!/usr/bin/env python
#
# Copyright 2013 Pixar
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License
# and the following modification to it: Section 6 Trademarks.
# deleted and replaced with:
#
# 6. Trademarks. This License does not grant permission to use the
# trade names, trademarks, service marks, or product names of the
# Licensor and its affiliates, except as required for reproducing
# the content of the NOTICE file.
#
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific
# language governing permissions and limitations under the
# License.
#
import HTMLParser
import json
import os
import re
import string
import sys
class HtmlToTextParser(HTMLParser.HTMLParser):
    """Extract search text, the page title and a navigation flag from HTML.

    Feed a document with ``feed()``, then query ``GetText()``, ``GetTitle()``
    and ``HasNavigationSection()``.  Text found inside ``<script>`` and
    ``<style>`` elements is ignored.  All collected text is reduced to the
    characters ``. , - space a-z A-Z 0-9 _`` so that it can be embedded in a
    double-quoted string without escaping.
    """

    # Runs of whitespace are collapsed to a single blank.
    _WHITESPACE_RE = re.compile(r'[\s]+')
    # Everything outside the allowed character set is dropped.
    _DISALLOWED_RE = re.compile(r'[^\.,\- a-zA-Z0-9_]+')

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.m_text = []          # sanitized fragments, each with a trailing blank
        self.m_inTitle = False
        self.m_inScript = False
        self.m_inStyle = False
        self.m_title = ""
        self.m_titleParts = []    # fragments of the <title> currently being read
        self.m_navigation = False

    def handle_data(self, data):
        """Record a chunk of character data (and the title, when inside one)."""
        if self.m_inScript or self.m_inStyle:
            return

        text = data.strip()
        if not text:
            # Whitespace-only chunks carry no text and must not clobber a
            # title that was already collected.
            return

        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._DISALLOWED_RE.sub('', text)
        self.m_text.append(text + ' ')

        # The parser may deliver a title in several chunks (for example
        # around entity references); accumulate them instead of keeping
        # only the last one.
        if self.m_inTitle and text:
            self.m_titleParts.append(str(text))
            self.m_title = ' '.join(self.m_titleParts)

    def handle_endtag(self, tag):
        """Leave the title/script/style state when its element closes."""
        tag = tag.lower()
        if tag == "title":
            self.m_inTitle = False
        elif tag == "script":
            self.m_inScript = False
        elif tag == "style":
            self.m_inStyle = False

    def handle_starttag(self, tag, attrs):
        """Track title/script/style state and detect the navigation <div>."""
        tag = tag.lower()
        if tag == "title":
            self.m_inTitle = True
            # A new title replaces the previous one once it yields text.
            self.m_titleParts = []
        elif tag == "script":
            self.m_inScript = True
        elif tag == "style":
            self.m_inStyle = True
        elif tag == "div":
            for attr in attrs:
                # Attributes written without a value (<div class>) are
                # reported with a value of None.
                if (len(attr) >= 2 and
                        attr[0].lower() == "class" and
                        attr[1] is not None and
                        attr[1].lower() == "navigation"):
                    self.m_navigation = True

    def HasNavigationSection(self):
        """Return True if a ``<div class="navigation">`` was encountered."""
        return self.m_navigation

    def GetText(self):
        """Return all collected text fragments as one stripped string."""
        return ''.join(self.m_text).strip()

    def GetTitle(self):
        """Return the sanitized page title ('' if none was found)."""
        return self.m_title
#-------------------------------------------------------------------------------
# Matches "<!", optional whitespace, "--", a body that never contains "--",
# the closing "--", optional whitespace and ">".  The body alternatives are
# mutually exclusive (a non-dash character, or a dash followed by a
# non-dash), so the engine never has two ways to consume the same character
# and cannot backtrack exponentially on malformed comments.
_HTML_COMMENT_RE = re.compile(
    r'<![ \r\n\t]*(--([^\-]|-[^\-])*--[ \r\n\t]*)>')


def StripHTMLComments(data):
    """Return *data* with every ``<!-- ... -->`` comment removed.

    Text that only looks like the start of a comment (unterminated, or
    containing ``--`` before the terminator) is left untouched.
    """
    return _HTML_COMMENT_RE.sub('', data)
#-------------------------------------------------------------------------------
def ReadNavigationTemplate( filePath ):
    """Load the navigation HTML template with its HTML comments removed.

    filePath -- path of the template file.

    Returns the template text, or an empty string (after printing a
    message) when the file cannot be opened.
    """
    try:
        navFile = open(filePath, "r")
    except IOError:
        print("Could not open file '" + filePath + "'")
        # Without a template there is nothing to insert; returning early
        # also avoids touching the unbound file handle below.
        return ""

    # The context manager closes the file; no explicit close() is needed.
    with navFile:
        print("Navigation template: '" + filePath + "'")
        navHtml = navFile.read()

    return StripHTMLComments(navHtml)
#-------------------------------------------------------------------------------
def WriteIndexFile( outputFile, content ):
    """Write *content* to *outputFile*, creating parent directories as needed.

    outputFile -- path of the file to (over)write.
    content    -- text written to the file.

    Raises OSError if the parent directory cannot be created and IOError
    if the file cannot be written.
    """
    outputPath = os.path.dirname(outputFile)
    # dirname is '' for a bare file name; there is nothing to create then.
    if outputPath and not os.path.isdir(outputPath):
        try:
            os.makedirs(outputPath)
        except OSError:
            # Tolerate the directory appearing concurrently, but do not hide
            # genuine failures such as missing permissions.
            if not os.path.isdir(outputPath):
                raise

    print("Creating Search-Index File : \"" + outputFile + "\"")

    with open(outputFile, "w") as indexFile:
        indexFile.write(content)
#-------------------------------------------------------------------------------
def Usage():
    """Print the command-line synopsis and terminate with exit status 1.

    The script takes two arguments: the directory tree of HTML files to
    process in place, and the navigation template to insert into them.
    """
    print(str(sys.argv[0]) + " <html directory> <navigation template>")
    # sys.exit is always available; the bare exit() helper is only injected
    # by the site module.
    sys.exit(1)
#-------------------------------------------------------------------------------
# Main
def _InsertNavigation(html, navHtml):
    """Insert *navHtml* directly after the opening ``<body ...>`` tag.

    Returns a ``(html, inserted)`` pair; when the document has no body tag
    it is returned unchanged and ``inserted`` is False.
    """
    bodyTag = re.search(r'<body\b[^>]*>', html, re.IGNORECASE)
    if bodyTag is None:
        return html, False
    end = bodyTag.end()
    return html[:end] + navHtml + html[end:], True


def _ProcessHtmlFile(inputFile, rootDir, navHtml):
    """Index one HTML page and patch it in place.

    The page gets the navigation template inserted (unless it already has a
    navigation section) and every ``OSD_ARTICLE_TITLE`` placeholder replaced
    by the page title.

    Returns the search-index entry for the page, or None when the page is
    the search page itself or could not be parsed.
    """
    with open(inputFile, "r+") as htmlFile:
        html = htmlFile.read()

        # parse the ReST generated HTML
        parser = HtmlToTextParser()
        try:
            parser.feed(html)
            title = parser.GetTitle()
            text = parser.GetText()
        except HTMLParser.HTMLParseError:
            # unparsable pages are skipped silently and left untouched
            return None

        msg = " \"" + inputFile + "\" - "
        entry = None

        # index the contents of the page for search
        if not inputFile.lower().endswith("search.html"):
            if title == "":
                title = "untitled"
            # "loc" is used as a URL: always use forward slashes, and let
            # json.dumps escape anything unsafe inside a quoted string.
            loc = os.path.relpath(inputFile, rootDir).replace(os.sep, "/")
            entry = ('{"title":' + json.dumps(title) +
                     ', "text":' + json.dumps(text) +
                     ', "tags": "", "loc":' + json.dumps(loc) + '}, \n')
            msg += "indexed - "

        # if necessary, insert navigation html
        if not parser.HasNavigationSection():
            html, inserted = _InsertNavigation(html, navHtml)
            if inserted:
                msg += "added navigation"
            else:
                msg += "no <body> tag, navigation skipped"

        # replace the article title placeholder with the real title
        # (an empty title simply removes the placeholder)
        html = html.replace("OSD_ARTICLE_TITLE", title)

        htmlFile.seek(0)
        htmlFile.write(html)
        # the new content may be shorter than the old one; drop the remainder
        htmlFile.truncate()

    print(msg)
    return entry


if len(sys.argv) < 3:
    Usage()

rootDir = str(sys.argv[1])
navTemplate = str(sys.argv[2])

navHtml = ReadNavigationTemplate( navTemplate )

print("Scanning : '" + rootDir + "'")

indexEntries = []

# recursively scan sub-directories for HTML files
for root, dirs, files in os.walk(rootDir):

    # skip doxygen generated HTML
    if 'doxy_html' in dirs:
        dirs.remove('doxy_html')

    for fileName in files:
        if not fileName.endswith((".html", ".htm")):
            continue
        entry = _ProcessHtmlFile(os.path.join(root, fileName), rootDir, navHtml)
        if entry is not None:
            indexEntries.append(entry)

searchIndex = 'var tipuesearch = { "pages": [ ' + ''.join(indexEntries) + "]};"

WriteIndexFile( os.path.join(rootDir, "tipuesearch", "tipuesearch_content.js"), searchIndex )