parsexml

This module implements a simple high performance XML / HTML parser. これは UTF-8 エンコーディング専用です。The parser has been designed to be somewhat error correcting, so that even most "wild HTML" found on the web can be parsed with it. Note: This parser does not check that each <tag> has a corresponding </tag>! These checks have to be implemented by the client code for various reasons:

  • Old HTML contains tags that have no end tag: <br> for example.
  • HTML tags are case insensitive, XML tags are case sensitive. Since this library can parse both, only the client knows which comparison is to be used.
  • Thus the checks would have been very difficult to implement properly with little benefit, especially since they are simple to implement in the client. The client should use the errorMsgExpected proc to generate a nice error message that fits the other error messages this library creates.

Example 1: Retrieve HTML title

The file examples/htmltitle.nim demonstrates how to use the XML parser to accomplish a simple task: To determine the title of an HTML document.

# Example program to show the parsexml module
# This program reads an HTML file and writes its title to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

# A file name must be given on the command line.
if paramCount() < 1:
  quit("Usage: htmltitle filename[.html]")

# The ".html" extension is optional on the command line.
let htmlFile = addFileExt(paramStr(1), "html")
let input = newFileStream(htmlFile, fmRead)
if input.isNil:
  quit("cannot open the file " & htmlFile)

var parser: XmlParser
parser.open(input, htmlFile)

while true:
  parser.next()
  if parser.kind == xmlEof:
    break # end of file reached
  # Every event other than a ``<title>`` start tag is ignored.
  if parser.kind != xmlElementStart or
      cmpIgnoreCase(parser.elementName, "title") != 0:
    continue

  parser.next() # skip "<title>"
  # Concatenate the consecutive character data events into the title text.
  var title = ""
  while parser.kind == xmlCharData:
    title.add(parser.charData)
    parser.next()

  let properlyClosed = parser.kind == xmlElementEnd and
    cmpIgnoreCase(parser.elementName, "title") == 0
  if not properlyClosed:
    # Report the missing end tag and keep scanning the rest of the file.
    echo(parser.errorMsgExpected("/title"))
    continue

  echo("Title: " & title)
  quit(0) # Success!

parser.close()
quit("Could not determine title!")

Example 2: Retrieve all HTML links

The file examples/htmlrefs.nim demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains.

# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.

import os, streams, parsexml, strutils

proc `=?=` (a, b: string): bool =
  ## Case-insensitive equality operator: true when ``cmpIgnoreCase`` reports
  ## no difference between ``a`` and ``b``.
  result = a.cmpIgnoreCase(b) == 0

# A file name must be given on the command line.
if paramCount() < 1:
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # number of ``<a ... href=...>`` tags found so far
let filename = addFileExt(paramStr(1), "html")
let s = newFileStream(filename, fmRead)
if s == nil: quit("cannot open the file " & filename)
var x: XmlParser
open(x, s, filename)
next(x) # get first event
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen:
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a":
        var link = ""
        var hasHref = false
        # Look at every attribute up to the ``xmlElementClose`` event, so
        # that ``href`` is found even when it is not the first attribute
        # (e.g. ``<a class="x" href="y">``).
        while true:
          x.next()
          case x.kind
          of xmlEof: break mainLoop
          of xmlElementClose: break
          of xmlAttribute:
            # the first ``href`` wins; a tag is counted at most once
            if not hasHref and x.attrKey =?= "href":
              link = x.attrValue
              hasHref = true
              inc(links)
          of xmlError:
            # report malformed attributes the same way the main loop does
            echo(errorMsg(x))
          else: discard
        x.next() # skip ``xmlElementClose``
        if hasHref:
          # the character data directly following the tag is taken as the
          # description of the ``a`` element
          var desc = ""
          while x.kind == xmlCharData:
            desc.add(x.charData)
            x.next()
          echo(desc & ": " & link)
      else:
        x.next()
    of xmlEof: break # end of file reached
    of xmlError:
      echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events

echo($links & " link(s) found!")
x.close()

XmlEventKind = enum
  xmlError,                   ## an error occurred during parsing
  xmlEof,                     ## end of file reached
  xmlCharData,                ## character data
  xmlWhitespace,              ## whitespace has been parsed
  xmlComment,                 ## a comment has been parsed
  xmlPI,                      ## processing instruction (``<?name something ?>``)
  xmlElementStart,            ## ``<elem>``
  xmlElementEnd,              ## ``</elem>``
  xmlElementOpen,             ## ``<elem``
  xmlAttribute,               ## ``key = "value"`` pair
  xmlElementClose,            ## ``>``
  xmlCData,                   ## ``<![CDATA[`` ... data ... ``]]>``
  xmlEntity,                  ## &entity;
  xmlSpecial                  ## ``<! ... data ... >``
構文解析時に発生するイベントの列挙型   ソース 編集
XmlErrorKind = enum
  errNone,                    ## エラーなし
  errEndOfCDataExpected,      ## ``]]>`` expected
  errNameExpected,            ## name expected
  errSemicolonExpected,       ## ``;`` expected
  errQmGtExpected,            ## ``?>`` expected
  errGtExpected,              ## ``>`` expected
  errEqExpected,              ## ``=`` expected
  errQuoteExpected,           ## ``"`` or ``'`` expected
  errEndOfCommentExpected,    ## ``-->`` expected
  errAttributeValueExpected   ## non-empty attribute value expected
発生するエラーをリスト化した列挙型   ソース 編集
XmlParseOption = enum
  reportWhitespace,           ## report whitespace
  reportComments,             ## report comments
  allowUnquotedAttribs,       ## allow unquoted attribute values (for HTML)
  allowEmptyAttribs           ## allow empty attributes (without explicit value)
options for the XML parser   ソース 編集
XmlParser = object of BaseLexer
  a, b, c: string
  kind: XmlEventKind
  err: XmlErrorKind
  state: ParserState
  cIsEmpty: bool
  filename: string
  options: set[XmlParseOption]
パーサーのオブジェクトです。  ソース 編集

プロシージャ

proc open(my: var XmlParser; input: Stream; filename: string;
         options: set[XmlParseOption] = {}) {...}{.raises: [Defect, IOError, OSError],
    tags: [ReadIOEffect].}
入力ストリームでパーサーを初期化します。Filename はわかりやすいエラーメッセージの出力で使われます。The parser's behaviour can be controlled by the options parameter: If options contains reportWhitespace a whitespace token is reported as an xmlWhitespace event. If options contains reportComments a comment token is reported as an xmlComment event.   ソース 編集
proc close(my: var XmlParser) {...}{.inline, raises: [Exception, IOError, OSError],
                            tags: [WriteIOEffect].}
パーサー my と関連する入力ストリームを閉じます  ソース 編集
proc kind(my: XmlParser): XmlEventKind {...}{.inline, raises: [], tags: [].}
returns the current event type for the XML parser   ソース 編集
proc rawData(my: var XmlParser): string {...}{.inline, raises: [], tags: [].}
returns the underlying 'data' string by reference. これはスピードハックに限り使います。  ソース 編集
proc rawData2(my: var XmlParser): string {...}{.inline, raises: [], tags: [].}
returns the underlying second 'data' string by reference. これはスピードハックに限り使います。  ソース 編集
proc getColumn(my: XmlParser): int {...}{.inline, raises: [], tags: [].}
パーサーが到達した現在の桁を取得します。  ソース 編集
proc getLine(my: XmlParser): int {...}{.inline, raises: [], tags: [].}
パーサーが到達した現在の行を取得します。  ソース 編集
proc getFilename(my: XmlParser): string {...}{.inline, raises: [], tags: [].}
パーサーで処理するファイルの名前を取得します。  ソース 編集
proc errorMsg(my: XmlParser): string {...}{.raises: [ValueError], tags: [].}
returns a helpful error message for the event xmlError   ソース 編集
proc errorMsgExpected(my: XmlParser; tag: string): string {...}{.raises: [ValueError],
    tags: [].}
returns an error message "<tag> expected" in the same format as the other error messages   ソース 編集
proc errorMsg(my: XmlParser; msg: string): string {...}{.raises: [ValueError], tags: [].}
returns an error message with text msg in the same format as the other error messages   ソース 編集
proc next(my: var XmlParser) {...}{.raises: [Defect, IOError, OSError], tags: [ReadIOEffect].}
first/next イベントを取得します。これはパーサーを制御します。  ソース 編集

テンプレート

template charData(my: XmlParser): string
returns the character data for the events: xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial. Raises an assertion in debug mode if my.kind is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集
template elementName(my: XmlParser): string
returns the element name for the events: xmlElementStart, xmlElementEnd, xmlElementOpen. Raises an assertion in debug mode if my.kind is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集
template entityName(my: XmlParser): string
returns the entity name for the event: xmlEntity. Raises an assertion in debug mode if my.kind is not xmlEntity. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集
template attrKey(my: XmlParser): string
returns the attribute key for the event xmlAttribute. Raises an assertion in debug mode if my.kind is not xmlAttribute. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集
template attrValue(my: XmlParser): string
returns the attribute value for the event xmlAttribute. Raises an assertion in debug mode if my.kind is not xmlAttribute. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集
template piName(my: XmlParser): string
returns the processing instruction name for the event xmlPI. Raises an assertion in debug mode if my.kind is not xmlPI. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集
template piRest(my: XmlParser): string
returns the rest of the processing instruction for the event xmlPI. Raises an assertion in debug mode if my.kind is not xmlPI. In release mode, this will not trigger an error but the value returned will not be valid.   ソース 編集