HTML Tag Parsing | 易学教程

问题

How can I parse Name: & Value text from within the tag with DIHtmlParser? I tried doing it with TCLHtmlParser from Clever Components but it failed. Second question is can DIHtmlParser parse individual tags for example loop through its sub tags. Its a total nightmare for such a simple problem.

<div class="tvRow tvFirst hasLabel tvFirst" title="example1">
  <label class="tvLabel">Name:</label>
  <span class="tvValue">Value</span>
<div class="clear"></div></div>

<div class="tvRow tvFirst hasLabel tvFirst" title="example2">
  <label class="tvLabel">Name:</label>
  <span class="tvValue">Value</span>
<div class="clear"></div></div>

回答1:

You could use IHTMLDocument2 DOM to parse whatever elements you need from the HTML:

uses ActiveX, MSHTML;

const
  HTML =
  '<div class="tvRow tvFirst hasLabel tvFirst" title="example1">' +
  '<label class="tvLabel">Name:</label>' +
  '<span class="tvValue">Value</span>' +
  '<div class="clear"></div>' +
  '</div>';

procedure TForm1.Button1Click(Sender: TObject);
var
  doc: OleVariant;
  el: OleVariant;
  i: Integer;
begin
  doc := coHTMLDocument.Create as IHTMLDocument2;
  doc.write(HTML);
  doc.close;
  ShowMessage(doc.body.innerHTML);
  for i := 0 to doc.body.all.length - 1 do
  begin
    el := doc.body.all.item(i);
    if (el.tagName = 'LABEL') and (el.className = 'tvLabel') then
      ShowMessage(el.innerText);
    if (el.tagName = 'SPAN') and (el.className = 'tvValue') then
      ShowMessage(el.innerText);
  end;
end;

I wanted to mention another very nice HTML parser I found today: htmlp (Delphi Dom HTML Parser and Converter). It's not as flexible as the IHTMLDocument2 obviously, but it's very easy to work with, fast, free, and supports Unicode for older Delphi versions.

Sample usage:

uses HtmlParser, DomCore;

function GetDocBody(HtmlDoc: TDocument): TElement;
var
  i: integer;
  node: TNode;
begin
  Result := nil;
  for i := 0 to HtmlDoc.documentElement.childNodes.length - 1 do
  begin
    node := HtmlDoc.documentElement.childNodes.item(i);
    if node.nodeName = 'body' then
    begin
      Result := node as TElement;
      Break;
    end;
  end;
end;

procedure THTMLForm.Button2Click(Sender: TObject);
var
  HtmlParser: THtmlParser;
  HtmlDoc: TDocument;
  i: Integer;
  body, el: TElement;
  node: TNode;
begin
  HtmlParser := THtmlParser.Create;
  try
    HtmlDoc := HtmlParser.parseString(HTML);
    try
      body := GetDocBody(HtmlDoc);
      if Assigned(body) then
        for i := 0 to body.childNodes.length - 1 do
        begin
          node := body.childNodes.item(i);
          if (node is TElement) then
          begin
            el := node as TElement;
            if (el.tagName = 'div') and (el.GetAttribute('class') = 'tvRow tvFirst hasLabel tvFirst') then
            begin
              // iterate el.childNodes here...
              ShowMessage(IntToStr(el.childNodes.length));
            end;
          end;
        end;
    finally
      HtmlDoc.Free;
    end;
  finally
    HtmlParser.Free
  end;
end;

回答2:

Use a HTML Parser to work on your html files.

Maybe DIHtmlParser will do the job.

RegEx is not a parser and converting from HTML to JSON is not a wise option.

回答3:

One can also use a combination of HTMLP parser with THtmlFormatter and OXml XPath parsing

uses
  // Htmlp
  HtmlParser,
  DomCore,
  Formatter,
  // OXml
  OXmlPDOM,
  OXmlUtils;

function HtmlToXHtml(const Html: string): string;
var
  HtmlParser: THtmlParser;
  HtmlDoc: TDocument;
  Formatter: THtmlFormatter;
begin
  HtmlParser := THtmlParser.Create;
  try
    HtmlDoc := HtmlParser.ParseString(Html);
    try
      Formatter := THtmlFormatter.Create;
      try
        Result := Formatter.GetText(HtmlDoc);
      finally
        Formatter.Free;
      end;
    finally
      HtmlDoc.Free;
    end;
  finally
    HtmlParser.Free;
  end;
end;

type
  TCard = record
    Store: string;
    Quality: string;
    Quantity: string;
    Price: string;
  end;
  TCards = array of TCard;

function ParseCard(const Node: PXMLNode): TCard;
const
  StoreXPath = 'div[1]/ax';
  QualityXPath = 'div[3]';
  QuantityXPath = 'div[4]';
  PriceXPath = 'div[5]';
var
  CurrentNode: PXMLNode;
begin
  Result := Default(TCard);
  if Node.SelectNode(StoreXPath, CurrentNode) then
     Result.Store := CurrentNode.Text;
  if Node.SelectNode(QualityXPath, CurrentNode) then
     Result.Quality := CurrentNode.Text;
  if Node.SelectNode(QuantityXPath, CurrentNode) then
     Result.Quantity := CurrentNode.Text;
  if Node.SelectNode(PriceXPath, CurrentNode) then
     Result.Price := CurrentNode.Text;
end;

procedure THTMLForm.OpenButtonClick(Sender: TObject);
var
  Html: string;
  Xml: string;
  FXmlDocument: IXMLDocument;
  QueryNode: PXMLNode;
  XPath: string;
  NodeList: IXMLNodeList;
  i: Integer;
  Card: TCard;
begin
  Html := System.IOUtils.TFile.ReadAllText(FileNameEdit.Text, TEncoding.UTF8);
  Xml := HtmlToXHtml(Html);
  Memo.Lines.Text := Xml;

  // Parse with XPath
  FXMLDocument := CreateXMLDoc;
  FXMLDocument.WriterSettings.IndentType := itIndent;
  if not FXMLDocument.LoadFromXML(Xml) then
    raise Exception.Create('Source document is not valid');
  QueryNode := FXmlDocument.DocumentElement;
  XPath := '//div[@class="row pricetableline"]';
  NodeList := QueryNode.SelectNodes(XPath);
  for i := 0 to NodeList.Count -1 do
  begin
    Card := ParseCard(NodeList[i]);
    Memo.Lines.Text := Memo.Lines.Text + sLineBreak +
      Format('%0:s %1:s %2:s %3:s', [Card.Store, Card.Quality, Card.Quantity, Card.Price]);
  end;

  Memo.SelStart := 0;
  Memo.SelLength := 0;
end;

来源：https://stackoverflow.com/questions/14348346/html-tag-parsing

标签

html

Delphi

delphi-xe2