2013-07-04 17:47:56 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
#
|
2013-09-26 19:04:57 +00:00
|
|
|
# Copyright 2013 Pixar
|
2013-07-04 17:47:56 +00:00
|
|
|
#
|
2013-09-26 19:04:57 +00:00
|
|
|
# Licensed under the Apache License, Version 2.0 (the "Apache License")
|
|
|
|
# with the following modification; you may not use this file except in
|
|
|
|
# compliance with the Apache License and the following modification to it:
|
|
|
|
# Section 6. Trademarks. is deleted and replaced with:
|
2013-07-04 17:47:56 +00:00
|
|
|
#
|
2013-09-26 19:04:57 +00:00
|
|
|
# 6. Trademarks. This License does not grant permission to use the trade
|
|
|
|
# names, trademarks, service marks, or product names of the Licensor
|
|
|
|
# and its affiliates, except as required to comply with Section 4(c) of
|
|
|
|
# the License and to reproduce the content of the NOTICE file.
|
2013-07-04 17:47:56 +00:00
|
|
|
#
|
2013-09-26 19:04:57 +00:00
|
|
|
# You may obtain a copy of the Apache License at
|
2013-07-04 17:47:56 +00:00
|
|
|
#
|
2013-09-26 19:04:57 +00:00
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
2013-07-18 21:19:50 +00:00
|
|
|
#
|
2013-09-26 19:04:57 +00:00
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the Apache License with the above modification is
|
|
|
|
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
# KIND, either express or implied. See the Apache License for the specific
|
|
|
|
# language governing permissions and limitations under the Apache License.
|
2013-07-04 17:47:56 +00:00
|
|
|
#
|
2013-07-18 22:05:24 +00:00
|
|
|
|
2013-07-04 17:47:56 +00:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import string
|
|
|
|
import re
|
|
|
|
import HTMLParser
|
|
|
|
|
|
|
|
class HtmlToTextParser(HTMLParser.HTMLParser):
    """HTML parser that gathers a page's plain text, title and navigation flag.

    Feed a document with ``feed()`` and then query:

    * ``GetText()``  - the accumulated, sanitized text of the page,
    * ``GetTitle()`` - the most recent non-blank text chunk seen inside
      ``<title>``,
    * ``HasNavigationSection()`` - whether a ``<div class="navigation">``
      start tag was encountered.

    Text found inside ``<script>`` and ``<style>`` elements is ignored.
    """

    # Tags whose open/closed state is tracked, mapped to the instance flag
    # that records whether the parser is currently inside that element.
    _TRACKED_TAGS = {
        "title": "m_inTitle",
        "script": "m_inScript",
        "style": "m_inStyle",
    }

    # Compiled once instead of on every data chunk.
    _WHITESPACE_RUN = re.compile(r'\s+')
    _UNWANTED_CHARS = re.compile(r'[^\.,\- a-zA-Z0-9_]+')

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.m_text = []            # sanitized text chunks, each ending in ' '
        self.m_inTitle = False      # currently between <title> and </title>
        self.m_inScript = False     # currently between <script> and </script>
        self.m_inStyle = False      # currently between <style> and </style>
        self.m_title = ""           # last non-blank chunk seen inside <title>
        self.m_navigation = False   # a <div class="navigation"> was seen

    def handle_data(self, data):
        """Record one chunk of character data, unless inside script/style."""
        if self.m_inScript or self.m_inStyle:
            return

        text = data.strip()
        if not text:
            # Whitespace-only chunks carry no text and must not clobber a
            # title that was already captured.
            return

        # Collapse whitespace runs, then keep only letters, digits,
        # underscore, space, '.', ',' and '-'.
        text = self._WHITESPACE_RUN.sub(' ', text)
        text = self._UNWANTED_CHARS.sub('', text)

        self.m_text.append(text + ' ')
        if self.m_inTitle:
            self.m_title = str(text)

    def handle_endtag(self, tag):
        """Leave the title/script/style state when its end tag is seen."""
        flag = self._TRACKED_TAGS.get(tag.lower())
        if flag is not None:
            setattr(self, flag, False)

    def handle_starttag(self, tag, attrs):
        """Enter title/script/style state and detect the navigation div."""
        name = tag.lower()

        flag = self._TRACKED_TAGS.get(name)
        if flag is not None:
            setattr(self, flag, True)

        if name == "div":
            for attr in attrs:
                # A value-less attribute (<div class>) is reported with a
                # value of None, which has no .lower(); guard against it.
                if (len(attr) >= 2 and
                        attr[0].lower() == "class" and
                        attr[1] is not None and
                        attr[1].lower() == "navigation"):
                    self.m_navigation = True

    def HasNavigationSection(self):
        """Return True if a <div class="navigation"> start tag was parsed."""
        return self.m_navigation

    def GetText(self):
        """Return all recorded text chunks joined together and stripped."""
        return ''.join(self.m_text).strip()

    def GetTitle(self):
        """Return the captured title text, or '' if none was captured."""
        return self.m_title
|
|
|
|
|
2013-07-18 18:26:54 +00:00
|
|
|
|
2013-07-08 01:20:46 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Matches a comment declaration such as "<!-- text -->": "<!", optional
# whitespace, "--", a body in which no two dashes are adjacent, "--",
# optional whitespace, ">".
# The body alternation is deliberately limited to two mutually exclusive
# branches ("not a dash" / "a dash followed by a non-dash").  A third
# "[\r\n]" branch would overlap with "not a dash" and makes the match time
# explode on an unterminated comment that contains many newlines.
_HTML_COMMENT_RE = re.compile(
    r'<![ \r\n\t]*(--([^\-]|-[^\-])*--[ \r\n\t]*)>')


def StripHTMLComments(data):
    """Return `data` with every "<!-- ... -->" comment removed.

    data   -- string containing HTML markup.

    Text that merely starts like a comment but is never terminated is left
    untouched.
    """
    return _HTML_COMMENT_RE.sub('', data)
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
def ReadNavigationTemplate( filePath ):
    """Load the navigation HTML template and strip its HTML comments.

    filePath -- path of the template file to read.

    Returns the file's contents passed through StripHTMLComments(), or an
    empty string when the file cannot be opened (a message is printed in
    that case).
    """
    try:
        navFile = open( filePath, "r")
    except IOError:
        print("Could not open file '"+filePath+"'")
        # Without this early return the code below would touch the unbound
        # name 'navFile' and die with a NameError.
        return ""

    # The with-statement closes the file; no explicit close() is needed.
    with navFile:
        print("Navigation template: '"+filePath+"'")
        navHtml = navFile.read()

    return StripHTMLComments(navHtml)
|
|
|
|
|
2013-07-04 17:47:56 +00:00
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
def WriteIndexFile( outputFile, content ):
    """Write `content` to `outputFile`, creating parent directories as needed.

    outputFile -- path of the file to (over)write.
    content    -- string written as the complete file contents.

    Raises OSError if the parent directory cannot be created, and whatever
    open()/write() raise if the file itself cannot be written.
    """
    outputPath = os.path.dirname( outputFile )

    # A bare file name has an empty dirname; there is nothing to create then.
    if outputPath and not os.path.isdir( outputPath ):
        try:
            os.makedirs( outputPath )
        except OSError:
            # Tolerate only "someone else created it in the meantime";
            # any other failure is reported instead of silently ignored.
            if not os.path.isdir( outputPath ):
                raise

    print("Creating Search-Index File : \""+outputFile+"\"")

    # The with-statement closes the file even if write() raises.
    with open(outputFile, "w") as f:
        f.write(content)
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
def Usage():
    """Print the command-line synopsis and exit with status 1.

    The script reads two positional arguments: the directory that is
    scanned for HTML files (argv[1]) and the navigation template file
    (argv[2]).
    """
    print(str(sys.argv[0])+" <html directory> <navigation template>")
    # sys.exit is always available; the bare exit() helper is only
    # installed by the 'site' module.
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
#-------------------------------------------------------------------------------
|
|
|
|
# Main
|
|
|
|
def _InsertNavigation(html, navHtml):
    """Splice `navHtml` in right after the opening <body ...> tag of `html`.

    Returns a (newHtml, inserted) tuple.  When no opening body tag is found
    the document is returned unchanged and `inserted` is False.
    """
    bodyTag = re.search(r'<body\b[^>]*>', html, re.IGNORECASE)
    if bodyTag is None:
        return html, False
    cut = bodyTag.end()
    return html[:cut] + navHtml + html[cut:], True


if (len(sys.argv)<3):
    Usage()

rootDir = str(sys.argv[1])

navTemplate = str(sys.argv[2])

navHtml = ReadNavigationTemplate( navTemplate )

print("Scanning : '"+rootDir+"'")

# One JavaScript object literal per indexed page; joined once at the end
# instead of growing a string inside the loop.
indexEntries = []

# recursively scan sub-directories for HTML files
for root, dirs, files in os.walk(rootDir):

    # skip doxygen generated HTML (pruning 'dirs' in place stops os.walk
    # from descending into it)
    if 'doxy_html' in dirs:
        dirs.remove('doxy_html')

    for fileName in files:

        inputFile = os.path.join(root, fileName)
        if not inputFile.endswith((".html", ".htm")):
            continue

        # The with-statement closes the file on every path, including the
        # 'continue' taken when the page cannot be parsed.
        with open(inputFile, "r+") as htmlFile:
            html = htmlFile.read()

            # parse the ReST generated HTML
            parser = HtmlToTextParser()
            try:
                parser.feed(html)
                title = parser.GetTitle()
                text = parser.GetText()
            except HTMLParser.HTMLParseError:
                continue

            msg = " \""+inputFile+"\" - "

            # index the contents of the page for search
            if (not inputFile.lower().endswith("search.html")):
                if title == "":
                    title = "untitled"
                # Use URL-style separators and escape the characters that
                # would break out of the double-quoted JavaScript string.
                loc = os.path.relpath(inputFile, rootDir).replace(os.sep, "/")
                loc = loc.replace("\\", "\\\\").replace('"', '\\"')
                indexEntries.append('{"title":"'+title+'", "text":"'+text+'", "tags": "", "loc":"'+loc+'"}, \n')
                msg += "indexed - "

            # if necessary, insert navigation html
            if (not parser.HasNavigationSection()):
                html, inserted = _InsertNavigation(html, navHtml)
                if inserted:
                    msg += "added navigation"
                else:
                    # Previously a missing "<body>" made find() return -1
                    # and the navigation was spliced in at offset 5.
                    msg += "no <body> tag, navigation skipped"

            # replace the article title placeholder with the real title
            # (an empty title simply removes the placeholder)
            html = html.replace("OSD_ARTICLE_TITLE", title)

            htmlFile.seek(0)
            htmlFile.write(html)
            # Drop leftover bytes when the new content is shorter than the
            # old one.
            htmlFile.truncate()

        print(msg)

searchIndex = 'var tipuesearch = { "pages": [ ' + ''.join(indexEntries) + "]};"

WriteIndexFile( os.path.join(rootDir, "tipuesearch", "tipuesearch_content.js"), searchIndex )
|
2013-07-04 17:47:56 +00:00
|
|
|
|