OpenSubdiv/documentation/processHtml.py

196 lines
6.1 KiB
Python
Raw Normal View History

#!/usr/bin/env python
#
# Copyright 2013 Pixar
#
# Licensed under the Apache License, Version 2.0 (the "Apache License")
# with the following modification; you may not use this file except in
# compliance with the Apache License and the following modification to it:
# Section 6. Trademarks. is deleted and replaced with:
#
# 6. Trademarks. This License does not grant permission to use the trade
# names, trademarks, service marks, or product names of the Licensor
# and its affiliates, except as required to comply with Section 4(c) of
# the License and to reproduce the content of the NOTICE file.
#
# You may obtain a copy of the Apache License at
#
# http://www.apache.org/licenses/LICENSE-2.0
2013-07-18 21:19:50 +00:00
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Apache License with the above modification is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the Apache License for the specific
# language governing permissions and limitations under the Apache License.
#
2013-07-18 22:05:24 +00:00
import os
import sys
import string
import re
import HTMLParser
class HtmlToTextParser(HTMLParser.HTMLParser):
    """HTML parser that gathers a page's plain text, title and navigation flag.

    Feed a document with feed(); afterwards:
      * GetText()  returns the sanitized text of every data chunk that is
        not inside a <script> or <style> element,
      * GetTitle() returns the sanitized text found inside <title>,
      * HasNavigationSection() tells whether a <div class="navigation">
        start tag was seen.
    """

    # Raw-string patterns (the escapes are meant for the regex engine, not
    # for Python string literals), compiled once for all instances.
    _WHITESPACE_RE = re.compile(r'\s+')
    _DISALLOWED_RE = re.compile(r'[^.,\- a-zA-Z0-9_]+')

    def __init__(self):
        # explicit base call: HTMLParser is an old-style class on Python 2,
        # so super() cannot be used there
        HTMLParser.HTMLParser.__init__(self)
        self.m_text = []          # sanitized text chunks, each with a trailing space
        self.m_inTitle = False    # currently between <title> and </title>
        self.m_inScript = False   # currently between <script> and </script>
        self.m_inStyle = False    # currently between <style> and </style>
        self.m_title = ""         # sanitized title text
        self.m_titleParts = []    # title chunks of the <title> being parsed
        self.m_navigation = False # a <div class="navigation"> was seen

    def handle_data(self, data):
        """Record one chunk of character data, ignoring script/style content."""
        if self.m_inScript or self.m_inStyle:
            return

        text = data.strip()
        if not text:
            return

        # collapse whitespace runs, then drop every character outside the
        # allowed set (letters, digits, '_', '.', ',', '-', space)
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._DISALLOWED_RE.sub('', text)
        self.m_text.append(text + ' ')

        # The parser may deliver the content of <title> in several chunks
        # (e.g. split around an entity), so accumulate them instead of
        # keeping only the last one.
        if self.m_inTitle and text:
            self.m_titleParts.append(str(text))
            self.m_title = ' '.join(self.m_titleParts)

    def handle_endtag(self, tag):
        """Leave the title/script/style state when its end tag is met."""
        tag = tag.lower()
        if tag == "title":
            self.m_inTitle = False
        elif tag == "script":
            self.m_inScript = False
        elif tag == "style":
            self.m_inStyle = False

    def handle_starttag(self, tag, attrs):
        """Enter the title/script/style state and detect the navigation div."""
        tag = tag.lower()
        if tag == "title":
            self.m_inTitle = True
            # a new <title> starts a fresh accumulation; m_title itself is
            # only replaced once that element actually provides text
            self.m_titleParts = []
        elif tag == "script":
            self.m_inScript = True
        elif tag == "style":
            self.m_inStyle = True
        elif tag == "div":
            for attr in attrs:
                # attr[1] is None for a valueless attribute (<div class>)
                if (len(attr) >= 2 and
                        attr[0].lower() == "class" and
                        attr[1] is not None and
                        attr[1].lower() == "navigation"):
                    self.m_navigation = True

    def HasNavigationSection(self):
        """Return True if a <div class="navigation"> start tag was parsed."""
        return self.m_navigation

    def GetText(self):
        """Return all recorded text chunks concatenated, without outer spaces."""
        return ''.join(self.m_text).strip()

    def GetTitle(self):
        """Return the sanitized page title ('' if none was found)."""
        return self.m_title
#-------------------------------------------------------------------------------
# Matches '<!', optional whitespace, '--', a body that never contains '--',
# the closing '--', optional whitespace and '>'.
# The body alternatives start with different characters (non-dash vs. dash),
# so the engine has a single way to consume any input and cannot backtrack
# exponentially on an unterminated comment.
_HTML_COMMENT_RE = re.compile(r'<![ \r\n\t]*--(?:[^-]|-[^-])*--[ \r\n\t]*>')

def StripHTMLComments(data):
    """Return 'data' with every HTML comment (<!-- ... -->) removed.

    Whitespace is tolerated between '<!' and '--' and between '--' and '>'.
    Text that does not form a complete comment is left unchanged.
    """
    return _HTML_COMMENT_RE.sub('', data)
#-------------------------------------------------------------------------------
def ReadNavigationTemplate( filePath ):
    """Return the content of the navigation template with HTML comments removed.

    filePath -- path of the template file to read.

    If the file cannot be opened, a message is printed and an empty string
    is returned.
    """
    try:
        navFile = open(filePath, "r")
    except IOError:
        print("Could not open file '"+filePath+"'")
        # nothing to read: bail out here instead of falling through to code
        # that would use the unbound file handle
        return ""

    # the 'with' statement closes the file, no explicit close() needed
    with navFile:
        print("Navigation template: '"+filePath+"'")
        navHtml = navFile.read()

    return StripHTMLComments(navHtml)
#-------------------------------------------------------------------------------
def WriteIndexFile( outputFile, content ):
    """Write 'content' to 'outputFile', creating its directory if necessary.

    outputFile -- path of the file to (over)write.
    content    -- text written to the file.

    Raises OSError if the directory is missing and cannot be created; errors
    from opening or writing the file propagate unchanged.
    """
    outputPath = os.path.dirname(outputFile)

    # dirname() is '' for a bare file name: nothing to create in that case
    if outputPath and not os.path.isdir(outputPath):
        try:
            os.makedirs(outputPath)
        except OSError:
            # only tolerate the failure if the directory exists after all
            # (e.g. it was created concurrently)
            if not os.path.isdir(outputPath):
                raise

    print("Creating Search-Index File : \""+outputFile+"\"")

    # 'with' guarantees the handle is closed even if write() fails
    with open(outputFile, "w") as indexFile:
        indexFile.write(content)
#-------------------------------------------------------------------------------
def Usage():
    """Print the command-line synopsis and terminate with exit status 1.

    The script reads two arguments: the directory holding the HTML pages
    (sys.argv[1]) and the navigation template file (sys.argv[2]).
    """
    print(str(sys.argv[0])+" <html directory> <navigation template>")
    # sys.exit rather than the 'exit' helper, which only exists when the
    # 'site' module has been loaded
    sys.exit(1)
#-------------------------------------------------------------------------------
# Main
def Main():
    """Post-process the HTML pages found under the directory given in sys.argv[1].

    sys.argv[2] is the navigation template.  For each *.html / *.htm file
    (directories named 'doxy_html' are not visited):
      * the page is parsed; pages raising HTMLParseError are left untouched,
      * unless its name ends in 'search.html', an entry is added to the
        search index,
      * if it has no <div class="navigation">, the navigation template is
        inserted right after the '<body>' tag,
      * every 'OSD_ARTICLE_TITLE' placeholder is replaced with the title,
      * the file is rewritten in place.
    The search index is finally written to
    <directory>/tipuesearch/tipuesearch_content.js.
    """
    if len(sys.argv) < 3:
        Usage()

    rootDir = str(sys.argv[1])
    navTemplate = str(sys.argv[2])

    navHtml = ReadNavigationTemplate(navTemplate)

    print("Scanning : '"+rootDir+"'")

    bodyTag = "<body>"

    # index entries are collected and joined once at the end
    indexEntries = []

    # recursively scan sub-directories for HTML files
    for root, dirs, files in os.walk(rootDir):

        # skip doxygen generated HTML (removing the name from 'dirs' in place
        # keeps os.walk from descending into it)
        if 'doxy_html' in dirs:
            dirs.remove('doxy_html')

        for fileName in files:

            inputFile = os.path.join(root, fileName)
            if not inputFile.endswith((".html", ".htm")):
                continue

            # 'with' closes the handle on every path, including 'continue'
            with open(inputFile, "r+") as htmlFile:

                html = htmlFile.read()

                # parse the ReST generated HTML
                parser = HtmlToTextParser()
                try:
                    parser.feed(html)
                    title = parser.GetTitle()
                    text = parser.GetText()
                except HTMLParser.HTMLParseError:
                    continue

                msg = " \""+inputFile+"\" - "

                # index the contents of the page for search
                if not inputFile.lower().endswith("search.html"):

                    if title == "":
                        title = "untitled"

                    # use '/' separators so a Windows path does not put
                    # backslashes (escape characters) into the JS string
                    pageLoc = os.path.relpath(inputFile, rootDir).replace(os.sep, "/")

                    indexEntries.append('{"title":"'+title+'", "text":"'+text+'", "tags": "", "loc":"'+pageLoc+'"}, \n')
                    msg += "indexed - "

                # if necessary, insert navigation html
                if not parser.HasNavigationSection():

                    bodyPos = html.find(bodyTag)
                    if bodyPos >= 0:
                        insertAt = bodyPos + len(bodyTag)
                        html = html[:insertAt] + navHtml + html[insertAt:]
                        msg += "added navigation"
                    else:
                        # find() returned -1: splicing at that offset would
                        # corrupt the page, so leave it without navigation
                        msg += "no <body> tag, navigation not added"

                # replace the article title placeholder with the real title
                # (an empty title simply removes the placeholder)
                html = html.replace("OSD_ARTICLE_TITLE", title)

                htmlFile.seek(0)
                htmlFile.write(html)
                # drop leftover bytes when the new content is shorter than
                # the original file
                htmlFile.truncate()

            print(msg)

    searchIndex = 'var tipuesearch = { "pages": [ ' + ''.join(indexEntries) + "]};"

    WriteIndexFile( os.path.join(rootDir, "tipuesearch", "tipuesearch_content.js"), searchIndex )


if __name__ == "__main__":
    Main()