#!/usr/bin/env python
#
# Copyright 2013 Pixar
#
# Licensed under the Apache License, Version 2.0 (the "Apache License")
# with the following modification; you may not use this file except in
# compliance with the Apache License and the following modification to it:
# Section 6. Trademarks. is deleted and replaced with:
#
# 6. Trademarks. This License does not grant permission to use the trade
# names, trademarks, service marks, or product names of the Licensor
# and its affiliates, except as required to comply with Section 4(c) of
# the License and to reproduce the content of the NOTICE file.
#
# You may obtain a copy of the Apache License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Apache License with the above modification is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the Apache License for the specific
# language governing permissions and limitations under the Apache License.
#
from __future__ import print_function
import os
import sys
import string
import re
from html.parser import HTMLParser
class HtmlToTextParser(HTMLParser):
    """Collect the searchable plain text, the title and a navigation flag
    from an HTML document.

    Feed the document with ``feed()`` and then query the result with
    ``GetText()``, ``GetTitle()`` and ``HasNavigationSection()``.
    """

    # Compiled once for the class instead of on every data chunk.  Raw
    # strings avoid the invalid-escape warnings that '\s', '\.' and '\-'
    # trigger in ordinary string literals.
    _WHITESPACE_RE = re.compile(r'[\s]+')
    _DISALLOWED_RE = re.compile(r'[^\.,\- a-zA-Z0-9_]+')

    def __init__(self):
        HTMLParser.__init__(self)
        self.m_text = []           # cleaned text chunks, each with a trailing space
        self.m_inTitle = False     # currently between <title> and </title>
        self.m_inScript = False    # currently between <script> and </script>
        self.m_inStyle = False     # currently between <style> and </style>
        self.m_title = ""          # last non-empty cleaned text seen in <title>
        self.m_navigation = False  # a <div class="navigation"> was encountered

    def handle_data(self, data):
        """Record a chunk of character data, ignoring script and style bodies.

        Whitespace runs are collapsed to a single space and every character
        other than letters, digits, '_', '.', ',', '-' and space is dropped.
        """
        if self.m_inScript or self.m_inStyle:
            return
        text = data.strip()
        if not text:
            # Blank chunks add nothing to the index and must not overwrite
            # a title that was already captured.
            return
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._DISALLOWED_RE.sub('', text)
        self.m_text.append(text + ' ')
        if self.m_inTitle:
            self.m_title = str(text)

    def handle_endtag(self, tag):
        """Leave the title/script/style state when the matching tag closes."""
        tag = tag.lower()
        if tag == "title":
            self.m_inTitle = False
        elif tag == "script":
            self.m_inScript = False
        elif tag == "style":
            self.m_inStyle = False

    def handle_starttag(self, tag, attrs):
        """Enter the title/script/style state and detect the navigation div.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is None for an attribute written without a
        value (e.g. ``<div class>``).
        """
        tag = tag.lower()
        if tag == "title":
            self.m_inTitle = True
        elif tag == "script":
            self.m_inScript = True
        elif tag == "style":
            self.m_inStyle = True
        elif tag == "div":
            for attr in attrs:
                if len(attr) < 2 or attr[0].lower() != "class":
                    continue
                # A valueless attribute carries None; treat it as empty
                # rather than crashing on None.lower().
                if (attr[1] or "").lower() == "navigation":
                    self.m_navigation = True

    def HasNavigationSection(self):
        """Return True if a ``<div class="navigation">`` has been seen."""
        return self.m_navigation

    def GetText(self):
        """Return all collected text chunks joined into one stripped string."""
        return ''.join(self.m_text).strip()

    def GetTitle(self):
        """Return the cleaned document title ("" if none was found)."""
        return self.m_title
#-------------------------------------------------------------------------------
def StripHTMLComments(data):
    """Return ``data`` with every ``<!-- ... -->`` HTML comment removed.

    Matching is non-greedy, so each comment ends at the first ``-->`` that
    follows its opening ``<!--``.
    """
    # NOTE(review): DOTALL lets a comment span several lines -- confirm this
    # matches the intended behaviour for the navigation templates.
    regex = re.compile(r'<!--.*?-->', re.DOTALL)
    return regex.sub('', data)
#-------------------------------------------------------------------------------
def ReadNavigationTemplate( filePath ):
    """Read the navigation template at ``filePath`` with HTML comments removed.

    Prints a message naming the file being read.  If the file cannot be
    opened, a diagnostic is printed and an empty string is returned.
    """
    try:
        navFile = open( filePath, "r")
    except IOError:
        print("Could not open file \'"+filePath+"\'")
        # Nothing to read: hand back the empty default instead of falling
        # through and using a file handle that was never bound.
        return ""
    # The context manager closes the file, so no explicit close() is needed.
    with navFile:
        print("Navigation template: \'"+filePath+"\'")
        navHtml = navFile.read()
    return StripHTMLComments(navHtml)
#-------------------------------------------------------------------------------
def WriteIndexFile( outputFile, content ):
    """Write ``content`` to ``outputFile``, creating parent directories.

    Prints a message naming the file being created.  Raises OSError if the
    parent directory cannot be created or the file cannot be written.
    """
    outputPath = os.path.dirname( outputFile )
    # A bare filename has no directory component; makedirs("") would fail.
    if outputPath:
        try:
            os.makedirs( outputPath )
        except OSError:
            # An already-existing directory is the expected common case;
            # any other failure is a real error and is reported.
            if not os.path.isdir( outputPath ):
                raise
    print("Creating Search-Index File : \""+outputFile+"\"")
    # 'with' guarantees the handle is closed even if write() raises.
    with open(outputFile, "w") as f:
        f.write(content)
#-------------------------------------------------------------------------------
def Usage():
print(str(sys.argv[0])+"