This module implements a simple high performance XML / HTML parser. The only encoding that is supported is UTF-8. The parser has been designed to be somewhat error correcting, so that even most "wild HTML" found on the web can be parsed with it. Note: This parser does not check that each <tag> has a corresponding </tag>! These checks have to be implemented by the client code for various reasons:
Old HTML contains tags that have no end tag: <br> for example.

The file examples/htmltitle.nim demonstrates how to use the XML parser to accomplish a simple task: To determine the title of an HTML document.
# Example program to show the parsexml module
# This program reads an HTML file and writes its title to stdout.
# Errors and whitespace are ignored.
import os, streams, parsexml, strutils
# Require the input file name as the first command-line argument.
if paramCount() < 1:
  quit("Usage: htmltitle filename[.html]")

# ``addFileExt`` supplies the "html" extension for the given argument.
let filename = addFileExt(paramStr(1), "html")
let s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: XmlParser
open(x, s, filename)
while true:
  x.next()
  case x.kind
  of xmlElementStart:
    if cmpIgnoreCase(x.elementName, "title") == 0:
      var title = ""
      x.next() # skip "<title>"
      # accumulate consecutive character-data events into the title
      while x.kind == xmlCharData:
        title.add(x.charData)
        x.next()
      if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
        echo("Title: " & title)
        quit(0) # Success!
      else:
        # anything other than "</title>" here is reported, then scanning goes on
        echo(x.errorMsgExpected("/title"))
  of xmlEof: break # end of file reached
  else: discard # ignore other events
x.close()
quit("Could not determine title!")

The file examples/htmlrefs.nim demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains.
# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.
import os, streams, parsexml, strutils
proc `=?=` (a, b: string): bool =
  ## Case-insensitive equality: true when ``cmpIgnoreCase(a, b)`` reports
  ## no difference between `a` and `b`.
  # little trick: define our own comparator that ignores case
  cmpIgnoreCase(a, b) == 0
# Require the input file name as the first command-line argument.
if paramCount() < 1:
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # count the number of links
let filename = addFileExt(paramStr(1), "html")
let s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: XmlParser
open(x, s, filename)
next(x) # get first event
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen:
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a":
        x.next()
        if x.kind == xmlAttribute:
          if x.attrKey =?= "href":
            let link = x.attrValue
            inc(links)
            # skip until we have an ``xmlElementClose`` event
            while true:
              x.next()
              case x.kind
              of xmlEof: break mainLoop
              of xmlElementClose: break
              else: discard
            x.next() # skip ``xmlElementClose``
            # now we have the description for the ``a`` element
            var desc = ""
            while x.kind == xmlCharData:
              desc.add(x.charData)
              x.next()
            echo(desc & ": " & link)
      else:
        # any other opening tag: just advance, otherwise the loop would
        # keep seeing the same ``xmlElementOpen`` event forever
        x.next()
    of xmlEof: break # end of file reached
    of xmlError:
      echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events
echo($links & " link(s) found!")
x.close()

XmlEventKind = enum
  xmlError,         ## an error occurred during parsing
  xmlEof,           ## end of file reached
  xmlCharData,      ## character data
  xmlWhitespace,    ## whitespace has been parsed
  xmlComment,       ## a comment has been parsed
  xmlPI,            ## processing instruction (``<?name something ?>``)
  xmlElementStart,  ## ``<elem>``
  xmlElementEnd,    ## ``</elem>``
  xmlElementOpen,   ## ``<elem ``
  xmlAttribute,     ## ``key = "value"`` pair
  xmlElementClose,  ## ``>``
  xmlCData,         ## ``<![CDATA[`` ... data ... ``]]>``
  xmlEntity,        ## &entity;
  xmlSpecial        ## ``<! ... data ... >``
XmlErrorKind = enum
  errNone,                 ## no error
  errEndOfCDataExpected,   ## ``]]>`` expected
  errNameExpected,         ## name expected
  errSemicolonExpected,    ## ``;`` expected
  errQmGtExpected,         ## ``?>`` expected
  errGtExpected,           ## ``>`` expected
  errEqExpected,           ## ``=`` expected
  errQuoteExpected,        ## ``"`` or ``'`` expected
  errEndOfCommentExpected  ## ``-->`` expected
XmlParseOption = enum
  reportWhitespace,  ## report whitespace
  reportComments     ## report comments
XmlParser = object of BaseLexer
  a, b, c: string
  kind: XmlEventKind
  err: XmlErrorKind
  state: ParserState
  cIsEmpty: bool
  filename: string
  options: set[XmlParseOption]
proc open(my: var XmlParser; input: Stream; filename: string;
          options: set[XmlParseOption] = {}) {...}{.raises: [Exception],
    tags: [ReadIOEffect].}
If options contains reportWhitespace a whitespace token is reported as an xmlWhitespace event. If options contains reportComments a comment token is reported as an xmlComment event.

proc close(my: var XmlParser) {...}{.inline, raises: [Exception], tags: [].}

proc kind(my: XmlParser): XmlEventKind {...}{.inline, raises: [], tags: [].}

proc rawData(my: XmlParser): string {...}{.inline, raises: [], tags: [].}

proc rawData2(my: XmlParser): string {...}{.inline, raises: [], tags: [].}

proc getColumn(my: XmlParser): int {...}{.inline, raises: [], tags: [].}

proc getLine(my: XmlParser): int {...}{.inline, raises: [], tags: [].}

proc getFilename(my: XmlParser): string {...}{.inline, raises: [], tags: [].}

proc errorMsg(my: XmlParser): string {...}{.raises: [ValueError], tags: [].}
Returns a helpful error message for the event xmlError.

proc errorMsgExpected(my: XmlParser; tag: string): string {...}{.raises: [ValueError],
    tags: [].}

proc errorMsg(my: XmlParser; msg: string): string {...}{.raises: [ValueError], tags: [].}

proc next(my: var XmlParser) {...}{.raises: [Exception], tags: [ReadIOEffect].}

template charData(my: XmlParser): string
Returns the character data for the events: xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial. Raises an assertion in debug mode if my.kind is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid.

template elementName(my: XmlParser): string
Returns the element name for the events: xmlElementStart, xmlElementEnd, xmlElementOpen. Raises an assertion in debug mode if my.kind is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid.

template entityName(my: XmlParser): string
Returns the entity name for the event xmlEntity. Raises an assertion in debug mode if my.kind is not xmlEntity. In release mode, this will not trigger an error but the value returned will not be valid.

template attrKey(my: XmlParser): string
Returns the attribute key for the event xmlAttribute. Raises an assertion in debug mode if my.kind is not xmlAttribute. In release mode, this will not trigger an error but the value returned will not be valid.

template attrValue(my: XmlParser): string
Returns the attribute value for the event xmlAttribute. Raises an assertion in debug mode if my.kind is not xmlAttribute. In release mode, this will not trigger an error but the value returned will not be valid.

template piName(my: XmlParser): string
Returns the processing instruction name for the event xmlPI. Raises an assertion in debug mode if my.kind is not xmlPI. In release mode, this will not trigger an error but the value returned will not be valid.

template piRest(my: XmlParser): string
Returns the rest of the processing instruction for the event xmlPI. Raises an assertion in debug mode if my.kind is not xmlPI. In release mode, this will not trigger an error but the value returned will not be valid.
© 2006–2018 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/parsexml.html