HTMLparser

Name

HTMLparser -- interface for an HTML 4.0 non-verifying parser

Synopsis



typedef     htmlParserCtxt;
typedef     htmlParserCtxtPtr;
typedef     htmlParserNodeInfo;
typedef     htmlSAXHandler;
typedef     htmlSAXHandlerPtr;
typedef     htmlParserInput;
typedef     htmlParserInputPtr;
typedef     htmlDocPtr;
typedef     htmlNodePtr;
struct      htmlElemDesc;
typedef     htmlElemDescPtr;
struct      htmlEntityDesc;
typedef     htmlEntityDescPtr;
const htmlElemDesc* htmlTagLookup           (const xmlChar *tag);
const htmlEntityDesc* htmlEntityLookup      (const xmlChar *name);
const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);
int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);
int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);
const htmlEntityDesc* htmlParseEntityRef    (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);
int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);
void        htmlParseElement                (htmlParserCtxtPtr ctxt);
int         htmlParseDocument               (htmlParserCtxtPtr ctxt);
htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);
htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);
int         UTF8ToHtml                      (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
int         htmlEncodeEntities              (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen,
                                             int quoteChar);
int         htmlIsScriptAttribute           (const xmlChar *name);
int         htmlHandleOmittedElem           (int val);
void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);
htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);
int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Description

Details

htmlParserCtxt


htmlParserCtxtPtr


htmlParserNodeInfo


htmlSAXHandler


htmlSAXHandlerPtr


htmlParserInput


htmlParserInputPtr


htmlDocPtr


htmlNodePtr


struct htmlElemDesc

struct htmlElemDesc {
    const char *name;	/* The tag name */
    char startTag;      /* Whether the start tag can be implied */
    char endTag;        /* Whether the end tag can be implied */
    char saveEndTag;    /* Whether the end tag should be saved */
    char empty;         /* Is this an empty element ? */
    char depr;          /* Is this a deprecated element ? */
    char dtd;           /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;      /* is this a block 0 or inline 1 element */
    const char *desc;   /* the description */

/* NRK Jan.2003
 * New fields encapsulating HTML structure
 *
 * Bugs:
 *	This is a very limited representation.  It fails to tell us when
 *	an element *requires* subelements (we only have whether they're
 *	allowed or not), and it doesn't tell us where CDATA and PCDATA
 *	are allowed.  Some element relationships are not fully represented:
 *	these are flagged with the word MODIFIER
 */
    const char** subelts;		/* allowed sub-elements of this element */
    const char* defaultsubelt;	/* subelement for suggested auto-repair
					   if necessary or NULL */
    const char** attrs_opt;		/* Optional Attributes */
    const char** attrs_depr;		/* Additional deprecated attributes */
    const char** attrs_req;		/* Required attributes */
};


htmlElemDescPtr


struct htmlEntityDesc

struct htmlEntityDesc {
    unsigned int value;	/* the UNICODE value for the character */
    const char *name;	/* The entity name */
    const char *desc;   /* the description */
};


htmlEntityDescPtr


htmlTagLookup ()

const htmlElemDesc* htmlTagLookup           (const xmlChar *tag);

Lookup the HTML tag in the ElementTable

tag : 
Returns : 


htmlEntityLookup ()

const htmlEntityDesc* htmlEntityLookup      (const xmlChar *name);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly; a hash table is really needed.

name : 
Returns : 


htmlEntityValueLookup ()

const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly; a hash table is really needed.

value : 
Returns : 


htmlIsAutoClosed ()

int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks whether a tag is autoclosed by one of its children.

doc : 
elem : 
Returns : 


htmlAutoCloseTag ()

int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks whether the element or one of its children would autoclose the given tag.

doc : 
name : 
elem : 
Returns : 


htmlParseEntityRef ()

const htmlEntityDesc* htmlParseEntityRef    (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);

Parse an HTML entity reference.

[68] EntityRef ::= '&' Name ';'

ctxt : 
str : 
Returns : 


htmlParseCharRef ()

int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);

Parse a character reference.

[66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

ctxt : 
Returns : 


htmlParseElement ()

void        htmlParseElement                (htmlParserCtxtPtr ctxt);

parse an HTML element, this is highly recursive

[39] element ::= EmptyElemTag | STag content ETag

[41] Attribute ::= Name Eq AttValue

ctxt : 


htmlParseDocument ()

int         htmlParseDocument               (htmlParserCtxtPtr ctxt);

parse an HTML document (and build a tree if using the standard SAX interface).

ctxt : 
Returns : 


htmlSAXParseDoc ()

htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fall back to the default DOM behavior and return a tree.

cur : 
encoding : 
sax : 
userData : 
Returns : 


htmlParseDoc ()

htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);

parse an HTML in-memory document and build a tree.

cur : 
encoding : 
Returns : 


htmlSAXParseFile ()

htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.

filename : 
encoding : 
sax : 
userData : 
Returns : 


htmlParseFile ()

htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);

Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time.

filename : 
encoding : 
Returns : 


UTF8ToHtml ()

int         UTF8ToHtml                      (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out : 
outlen : 
in : 
inlen : 
Returns : 


htmlEncodeEntities ()

int         htmlEncodeEntities              (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen,
                                             int quoteChar);

Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.

out : 
outlen : 
in : 
inlen : 
quoteChar : 
Returns : 


htmlIsScriptAttribute ()

int         htmlIsScriptAttribute           (const xmlChar *name);

Check if an attribute is of content type Script

name : 
Returns : 


htmlHandleOmittedElem ()

int         htmlHandleOmittedElem           (int val);

Set and return the previous value for handling HTML omitted tags.

val : 
Returns : 


htmlFreeParserCtxt ()

void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);

Free all the memory used by a parser context. However, the parsed document in ctxt->myDoc is not freed.

ctxt : 


htmlCreatePushParserCtxt ()

htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);

Create a parser context for using the HTML parser in push mode. The value of filename is used for fetching external entities and for error/warning reports.

sax : 
user_data : 
chunk : 
size : 
filename : 
enc : 
Returns : 


htmlParseChunk ()

int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Parse a Chunk of memory

ctxt : 
chunk : 
size : 
terminate : 
Returns :