|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
  org.cyberneko.html.HTMLScanner
public class HTMLScanner
A simple HTML scanner. This scanner makes no attempt to balance tags or fix other problems in the source document — it just scans what it can and generates XNI document "events", ignoring errors of all kinds.
This component recognizes the following features:
This component recognizes the following properties:
See Also: HTMLElements, HTMLEntities
Nested Class Summary | |
---|---|
class |
HTMLScanner.ContentScanner
The primary HTML document scanner. |
static class |
HTMLScanner.CurrentEntity
Current entity. |
protected static class |
HTMLScanner.LocationItem
Location infoset item. |
static class |
HTMLScanner.PlaybackInputStream
A playback input stream. |
static interface |
HTMLScanner.Scanner
Basic scanner interface. |
class |
HTMLScanner.SpecialScanner
Special scanner used for elements whose content needs to be scanned as plain text, ignoring markup such as elements and entity references. |
Field Summary | |
---|---|
protected static java.lang.String |
AUGMENTATIONS
Include infoset augmentations. |
static java.lang.String |
CDATA_SECTIONS
Scan CDATA sections. |
protected static java.lang.reflect.Method |
CHARSET_forName
Charset#forName method, if available. |
protected static boolean |
DEBUG_CALLBACKS
Set to true to debug callbacks. |
protected static java.lang.reflect.Method |
DECODER_averageCharsPerByte
CharsetDecoder#averageCharsPerByte method, if available. |
protected static int |
DEFAULT_BUFFER_SIZE
Default buffer size. |
protected static java.lang.String |
DEFAULT_ENCODING
Default encoding. |
protected static java.lang.String |
DOCTYPE_PUBID
Doctype declaration public identifier. |
protected static java.lang.String |
DOCTYPE_SYSID
Doctype declaration system identifier. |
protected static java.lang.String |
ERROR_REPORTER
Error reporter. |
protected boolean |
fAugmentations
Augmentations. |
protected int |
fBeginColumnNumber
Beginning column number. |
protected int |
fBeginLineNumber
Beginning line number. |
protected HTMLScanner.PlaybackInputStream |
fByteStream
The playback byte stream. |
protected boolean |
fCDATASections
CDATA sections. |
protected HTMLScanner.Scanner |
fContentScanner
Content scanner. |
protected HTMLScanner.CurrentEntity |
fCurrentEntity
Current entity. |
protected java.util.Stack |
fCurrentEntityStack
The current entity stack. |
protected java.lang.String |
fDefaultIANAEncoding
Default encoding. |
protected java.lang.String |
fDoctypePubid
Doctype declaration public identifier. |
protected java.lang.String |
fDoctypeSysid
Doctype declaration system identifier. |
protected org.apache.xerces.xni.XMLDocumentHandler |
fDocumentHandler
The document handler. |
protected int |
fElementCount
Element count. |
protected int |
fElementDepth
Element depth. |
protected int |
fEndColumnNumber
Ending column number. |
protected int |
fEndLineNumber
Ending line number. |
protected HTMLErrorReporter |
fErrorReporter
Error reporter. |
protected boolean |
fFixWindowsCharRefs
Fix Microsoft Windows® character entity references. |
protected java.lang.String |
fIANAEncoding
Auto-detected IANA encoding. |
protected boolean |
fIgnoreSpecifiedCharset
Ignore specified character set. |
protected boolean |
fInsertDoctype
Insert document type declaration. |
protected boolean |
fIso8859Encoding
True if the encoding matches "ISO-8859-*". |
static java.lang.String |
FIX_MSWINDOWS_REFS
Fix Microsoft Windows® character entity references. |
protected java.lang.String |
fJavaEncoding
Auto-detected Java encoding. |
protected short |
fNamesAttrs
Modify HTML attribute names. |
protected short |
fNamesElems
Modify HTML element names. |
protected boolean |
fNormalizeAttributes
Normalize attribute values. |
protected boolean |
fNotifyCharRefs
Notify character entity references. |
protected boolean |
fNotifyHtmlBuiltinRefs
Notify HTML built-in general entity references. |
protected boolean |
fNotifyXmlBuiltinRefs
Notify XML built-in general entity references. |
protected boolean |
fOverrideDoctype
Override doctype declaration public and system identifiers. |
protected boolean |
fReportErrors
Report errors. |
protected HTMLScanner.Scanner |
fScanner
The current scanner. |
protected short |
fScannerState
The current scanner state. |
protected boolean |
fScriptStripCDATADelims
Strip CDATA delimiters from SCRIPT tags. |
protected boolean |
fScriptStripCommentDelims
Strip comment delimiters from SCRIPT tags. |
protected HTMLScanner.SpecialScanner |
fSpecialScanner
Special scanner used for elements whose content needs to be scanned as plain text, ignoring markup such as elements and entity references. |
protected org.apache.xerces.xni.XMLString |
fString
String. |
protected org.apache.xerces.util.XMLStringBuffer |
fStringBuffer
String buffer. |
protected boolean |
fStyleStripCDATADelims
Strip CDATA delimiters from STYLE tags. |
protected boolean |
fStyleStripCommentDelims
Strip comment delimiters from STYLE tags. |
static java.lang.String |
HTML_4_01_FRAMESET_PUBID
HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). |
static java.lang.String |
HTML_4_01_FRAMESET_SYSID
HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd"). |
static java.lang.String |
HTML_4_01_STRICT_PUBID
HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). |
static java.lang.String |
HTML_4_01_STRICT_SYSID
HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). |
static java.lang.String |
HTML_4_01_TRANSITIONAL_PUBID
HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN"). |
static java.lang.String |
HTML_4_01_TRANSITIONAL_SYSID
HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd"). |
static java.lang.String |
IGNORE_SPECIFIED_CHARSET
Ignore specified charset found in the <meta http-equiv='Content-Type' content='text/html;charset=…'> tag. |
static java.lang.String |
INSERT_DOCTYPE
Insert document type declaration. |
protected static java.lang.String |
NAMES_ATTRS
Modify HTML attribute names: { "upper", "lower", "default" }. |
protected static java.lang.String |
NAMES_ELEMS
Modify HTML element names: { "upper", "lower", "default" }. |
protected static short |
NAMES_LOWERCASE
Lowercase HTML names. |
protected static short |
NAMES_NO_CHANGE
Don't modify HTML names. |
protected static short |
NAMES_UPPERCASE
Uppercase HTML names. |
protected static java.lang.String |
NORMALIZE_ATTRIBUTES
Normalize attribute values. |
static java.lang.String |
NOTIFY_CHAR_REFS
Notify character entity references (e.g. &#32;, &#x20;, etc). |
static java.lang.String |
NOTIFY_HTML_BUILTIN_REFS
Notify handler of built-in HTML entity references (e.g. &nbsp;, &copy;, etc). |
static java.lang.String |
NOTIFY_XML_BUILTIN_REFS
Notify handler of built-in XML entity references (e.g. &amp;, &lt;, etc). |
static java.lang.String |
OVERRIDE_DOCTYPE
Override doctype declaration public and system identifiers. |
protected static java.lang.String |
REPORT_ERRORS
Report errors. |
static java.lang.String |
SCRIPT_STRIP_CDATA_DELIMS
Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from SCRIPT tag contents. |
static java.lang.String |
SCRIPT_STRIP_COMMENT_DELIMS
Strip HTML comment delimiters ("<!--" and "-->") from SCRIPT tag contents. |
protected static short |
STATE_CONTENT
State: content. |
protected static short |
STATE_END_DOCUMENT
State: end document. |
protected static short |
STATE_MARKUP_BRACKET
State: markup bracket. |
protected static short |
STATE_START_DOCUMENT
State: start document. |
static java.lang.String |
STYLE_STRIP_CDATA_DELIMS
Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from STYLE tag contents. |
static java.lang.String |
STYLE_STRIP_COMMENT_DELIMS
Strip HTML comment delimiters ("<!--" and "-->") from STYLE tag contents. |
protected static HTMLEventInfo |
SYNTHESIZED_ITEM
Synthesized event info item. |
Constructor Summary | |
---|---|
HTMLScanner()
|
Method Summary | |
---|---|
protected static boolean |
builtinXmlRef(java.lang.String name)
Returns true if the name is a built-in XML general entity reference. |
void |
cleanup(boolean closeall)
Cleans up used resources. |
static java.lang.String |
expandSystemId(java.lang.String systemId,
java.lang.String baseSystemId)
Expands a system id and returns the system id as a URI, if it can be expanded. |
protected static java.lang.String |
fixURI(java.lang.String str)
Fixes a platform dependent filename to standard URI form. |
protected int |
fixWindowsCharacter(int origChar)
Fixes Microsoft Windows® specific characters. |
java.lang.String |
getBaseSystemId()
Returns the base system identifier. |
int |
getCharacterOffset()
Returns the character offset. |
int |
getColumnNumber()
Returns the current column number. |
org.apache.xerces.xni.XMLDocumentHandler |
getDocumentHandler()
Returns the document handler. |
java.lang.String |
getEncoding()
Returns the encoding. |
java.lang.String |
getExpandedSystemId()
Returns the expanded system identifier. |
java.lang.Boolean |
getFeatureDefault(java.lang.String featureId)
Returns the default state for a feature. |
int |
getLineNumber()
Returns the current line number. |
java.lang.String |
getLiteralSystemId()
Returns the literal system identifier. |
protected static short |
getNamesValue(java.lang.String value)
Converts HTML names string value to constant value. |
java.lang.Object |
getPropertyDefault(java.lang.String propertyId)
Returns the default state for a property. |
java.lang.String |
getPublicId()
Returns the public identifier. |
java.lang.String[] |
getRecognizedFeatures()
Returns recognized features. |
java.lang.String[] |
getRecognizedProperties()
Returns recognized properties. |
protected static java.lang.String |
getValue(org.apache.xerces.xni.XMLAttributes attrs,
java.lang.String aname)
Returns the value of the specified attribute, ignoring case. |
java.lang.String |
getXMLVersion()
Returns the XML version. |
protected int |
load(int offset)
Loads a new chunk of data into the buffer and returns the number of characters loaded or -1 if no additional characters were loaded. |
protected org.apache.xerces.xni.Augmentations |
locationAugs()
Returns an augmentations object with a location item added. |
protected static java.lang.String |
modifyName(java.lang.String name,
short mode)
Modifies the given name based on the specified mode. |
void |
pushInputSource(org.apache.xerces.xni.parser.XMLInputSource inputSource)
Pushes an input source onto the current entity stack. |
protected int |
read()
Reads a single character. |
void |
reset(org.apache.xerces.xni.parser.XMLComponentManager manager)
Resets the component. |
protected org.apache.xerces.xni.XMLResourceIdentifier |
resourceId()
Returns an empty resource identifier. |
protected void |
scanDoctype()
Scans a DOCTYPE line. |
boolean |
scanDocument(boolean complete)
Scans the document. |
protected int |
scanEntityRef(org.apache.xerces.util.XMLStringBuffer str,
boolean content)
Scans an entity reference. |
protected java.lang.String |
scanLiteral()
Scans a quoted literal. |
protected java.lang.String |
scanName()
Scans a name. |
void |
setDocumentHandler(org.apache.xerces.xni.XMLDocumentHandler handler)
Sets the document handler. |
void |
setFeature(java.lang.String featureId,
boolean state)
Sets a feature. |
void |
setInputSource(org.apache.xerces.xni.parser.XMLInputSource source)
Sets the input source. |
void |
setProperty(java.lang.String propertyId,
java.lang.Object value)
Sets a property. |
protected void |
setScanner(HTMLScanner.Scanner scanner)
Sets the scanner. |
protected void |
setScannerState(short state)
Sets the scanner state. |
protected boolean |
skip(java.lang.String s,
boolean caseSensitive)
Returns true if the specified text is present and is skipped. |
protected boolean |
skipMarkup(boolean balance)
Skips markup. |
protected int |
skipNewlines()
Skips newlines and returns the number of newlines skipped. |
protected int |
skipNewlines(int maxlines)
Skips newlines and returns the number of newlines skipped. |
protected boolean |
skipSpaces()
Skips whitespace. |
protected org.apache.xerces.xni.Augmentations |
synthesizedAugs()
Returns an augmentations object with a synthesized item added. |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
public static final java.lang.String HTML_4_01_STRICT_PUBID
public static final java.lang.String HTML_4_01_STRICT_SYSID
public static final java.lang.String HTML_4_01_TRANSITIONAL_PUBID
public static final java.lang.String HTML_4_01_TRANSITIONAL_SYSID
public static final java.lang.String HTML_4_01_FRAMESET_PUBID
public static final java.lang.String HTML_4_01_FRAMESET_SYSID
protected static final java.lang.String AUGMENTATIONS
protected static final java.lang.String REPORT_ERRORS
public static final java.lang.String NOTIFY_CHAR_REFS
public static final java.lang.String NOTIFY_XML_BUILTIN_REFS
Note: This only applies to the five pre-defined XML general entities. Specifically, "amp", "lt", "gt", "quot", and "apos". This is done for compatibility with the Xerces feature.
To be notified of the built-in entity references in HTML, set the http://cyberneko.org/html/features/scanner/notify-builtin-refs feature to true.
public static final java.lang.String NOTIFY_HTML_BUILTIN_REFS
Note: This includes the five pre-defined XML general entities.
public static final java.lang.String FIX_MSWINDOWS_REFS
public static final java.lang.String SCRIPT_STRIP_COMMENT_DELIMS
public static final java.lang.String SCRIPT_STRIP_CDATA_DELIMS
public static final java.lang.String STYLE_STRIP_COMMENT_DELIMS
public static final java.lang.String STYLE_STRIP_CDATA_DELIMS
public static final java.lang.String IGNORE_SPECIFIED_CHARSET
public static final java.lang.String CDATA_SECTIONS
public static final java.lang.String OVERRIDE_DOCTYPE
public static final java.lang.String INSERT_DOCTYPE
protected static final java.lang.String NORMALIZE_ATTRIBUTES
protected static final java.lang.String NAMES_ELEMS
protected static final java.lang.String NAMES_ATTRS
protected static final java.lang.String DEFAULT_ENCODING
protected static final java.lang.String ERROR_REPORTER
protected static final java.lang.String DOCTYPE_PUBID
protected static final java.lang.String DOCTYPE_SYSID
protected static final short STATE_CONTENT
protected static final short STATE_MARKUP_BRACKET
protected static final short STATE_START_DOCUMENT
protected static final short STATE_END_DOCUMENT
protected static final short NAMES_NO_CHANGE
protected static final short NAMES_UPPERCASE
protected static final short NAMES_LOWERCASE
protected static final int DEFAULT_BUFFER_SIZE
protected static final boolean DEBUG_CALLBACKS
protected static java.lang.reflect.Method CHARSET_forName
protected static java.lang.reflect.Method DECODER_averageCharsPerByte
protected static final HTMLEventInfo SYNTHESIZED_ITEM
protected boolean fAugmentations
protected boolean fReportErrors
protected boolean fNotifyCharRefs
protected boolean fNotifyXmlBuiltinRefs
protected boolean fNotifyHtmlBuiltinRefs
protected boolean fFixWindowsCharRefs
protected boolean fScriptStripCDATADelims
protected boolean fScriptStripCommentDelims
protected boolean fStyleStripCDATADelims
protected boolean fStyleStripCommentDelims
protected boolean fIgnoreSpecifiedCharset
protected boolean fCDATASections
protected boolean fOverrideDoctype
protected boolean fInsertDoctype
protected boolean fNormalizeAttributes
protected short fNamesElems
protected short fNamesAttrs
protected java.lang.String fDefaultIANAEncoding
protected HTMLErrorReporter fErrorReporter
protected java.lang.String fDoctypePubid
protected java.lang.String fDoctypeSysid
protected int fBeginLineNumber
protected int fBeginColumnNumber
protected int fEndLineNumber
protected int fEndColumnNumber
protected HTMLScanner.PlaybackInputStream fByteStream
protected HTMLScanner.CurrentEntity fCurrentEntity
protected final java.util.Stack fCurrentEntityStack
protected HTMLScanner.Scanner fScanner
protected short fScannerState
protected org.apache.xerces.xni.XMLDocumentHandler fDocumentHandler
protected java.lang.String fIANAEncoding
protected java.lang.String fJavaEncoding
protected boolean fIso8859Encoding
protected int fElementCount
protected int fElementDepth
protected HTMLScanner.Scanner fContentScanner
protected HTMLScanner.SpecialScanner fSpecialScanner
protected final org.apache.xerces.xni.XMLString fString
protected final org.apache.xerces.util.XMLStringBuffer fStringBuffer
Constructor Detail |
---|
public HTMLScanner()
Method Detail |
---|
public void pushInputSource(org.apache.xerces.xni.parser.XMLInputSource inputSource)
Note: This functionality is experimental at this time and is subject to change in future releases of NekoHTML.
inputSource - The new input source to start scanning.
public void cleanup(boolean closeall)
closeall - Close all streams, including the original.
This is used in cases when the application has
opened the original document stream and should
be responsible for closing it.
public java.lang.String getEncoding()
getEncoding
in interface org.apache.xerces.xni.XMLLocator
public java.lang.String getPublicId()
getPublicId
in interface org.apache.xerces.xni.XMLLocator
public java.lang.String getBaseSystemId()
getBaseSystemId
in interface org.apache.xerces.xni.XMLLocator
public java.lang.String getLiteralSystemId()
getLiteralSystemId
in interface org.apache.xerces.xni.XMLLocator
public java.lang.String getExpandedSystemId()
getExpandedSystemId
in interface org.apache.xerces.xni.XMLLocator
public int getLineNumber()
getLineNumber
in interface org.apache.xerces.xni.XMLLocator
public int getColumnNumber()
getColumnNumber
in interface org.apache.xerces.xni.XMLLocator
public java.lang.String getXMLVersion()
getXMLVersion
in interface org.apache.xerces.xni.XMLLocator
public int getCharacterOffset()
getCharacterOffset
in interface org.apache.xerces.xni.XMLLocator
public java.lang.Boolean getFeatureDefault(java.lang.String featureId)
getFeatureDefault
in interface org.apache.xerces.xni.parser.XMLComponent
getFeatureDefault
in interface HTMLComponent
public java.lang.Object getPropertyDefault(java.lang.String propertyId)
getPropertyDefault
in interface org.apache.xerces.xni.parser.XMLComponent
getPropertyDefault
in interface HTMLComponent
public java.lang.String[] getRecognizedFeatures()
getRecognizedFeatures
in interface org.apache.xerces.xni.parser.XMLComponent
public java.lang.String[] getRecognizedProperties()
getRecognizedProperties
in interface org.apache.xerces.xni.parser.XMLComponent
public void reset(org.apache.xerces.xni.parser.XMLComponentManager manager) throws org.apache.xerces.xni.parser.XMLConfigurationException
reset
in interface org.apache.xerces.xni.parser.XMLComponent
org.apache.xerces.xni.parser.XMLConfigurationException
public void setFeature(java.lang.String featureId, boolean state) throws org.apache.xerces.xni.parser.XMLConfigurationException
setFeature
in interface org.apache.xerces.xni.parser.XMLComponent
org.apache.xerces.xni.parser.XMLConfigurationException
public void setProperty(java.lang.String propertyId, java.lang.Object value) throws org.apache.xerces.xni.parser.XMLConfigurationException
setProperty
in interface org.apache.xerces.xni.parser.XMLComponent
org.apache.xerces.xni.parser.XMLConfigurationException
public void setInputSource(org.apache.xerces.xni.parser.XMLInputSource source) throws java.io.IOException
setInputSource
in interface org.apache.xerces.xni.parser.XMLDocumentScanner
java.io.IOException
public boolean scanDocument(boolean complete) throws org.apache.xerces.xni.XNIException, java.io.IOException
scanDocument
in interface org.apache.xerces.xni.parser.XMLDocumentScanner
org.apache.xerces.xni.XNIException
java.io.IOException
public void setDocumentHandler(org.apache.xerces.xni.XMLDocumentHandler handler)
setDocumentHandler
in interface org.apache.xerces.xni.parser.XMLDocumentSource
public org.apache.xerces.xni.XMLDocumentHandler getDocumentHandler()
getDocumentHandler
in interface org.apache.xerces.xni.parser.XMLDocumentSource
protected static java.lang.String getValue(org.apache.xerces.xni.XMLAttributes attrs, java.lang.String aname)
public static java.lang.String expandSystemId(java.lang.String systemId, java.lang.String baseSystemId)
systemId
- The systemId to be expanded.
protected static java.lang.String fixURI(java.lang.String str)
str
- The string to fix.
protected static final java.lang.String modifyName(java.lang.String name, short mode)
protected static final short getNamesValue(java.lang.String value)
See Also: NAMES_NO_CHANGE, NAMES_LOWERCASE, NAMES_UPPERCASE
protected int fixWindowsCharacter(int origChar)
Details about this common problem can be found at http://www.cs.tut.fi/~jkorpela/www/windows-chars.html
protected int read() throws java.io.IOException
java.io.IOException
protected int load(int offset) throws java.io.IOException
offset
- The offset at which new characters should be loaded.
java.io.IOException
protected void setScanner(HTMLScanner.Scanner scanner)
protected void setScannerState(short state)
protected void scanDoctype() throws java.io.IOException
java.io.IOException
protected java.lang.String scanLiteral() throws java.io.IOException
java.io.IOException
protected java.lang.String scanName() throws java.io.IOException
java.io.IOException
protected int scanEntityRef(org.apache.xerces.util.XMLStringBuffer str, boolean content) throws java.io.IOException
java.io.IOException
protected boolean skip(java.lang.String s, boolean caseSensitive) throws java.io.IOException
java.io.IOException
protected boolean skipMarkup(boolean balance) throws java.io.IOException
java.io.IOException
protected boolean skipSpaces() throws java.io.IOException
java.io.IOException
protected int skipNewlines() throws java.io.IOException
java.io.IOException
protected int skipNewlines(int maxlines) throws java.io.IOException
java.io.IOException
protected final org.apache.xerces.xni.Augmentations locationAugs()
protected final org.apache.xerces.xni.Augmentations synthesizedAugs()
protected final org.apache.xerces.xni.XMLResourceIdentifier resourceId()
protected static boolean builtinXmlRef(java.lang.String name)
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |