HTML Tag Parsing

冷暖自知 提交于 2019-12-17 15:35:43

问题


How can I parse Name: & Value text from within the tag with DIHtmlParser? I tried doing it with TCLHtmlParser from Clever Components but it failed. Second question is can DIHtmlParser parse individual tags for example loop through its sub tags. Its a total nightmare for such a simple problem.

<div class="tvRow tvFirst hasLabel tvFirst" title="example1">
  <label class="tvLabel">Name:</label>
  <span class="tvValue">Value</span>
<div class="clear"></div></div>

<div class="tvRow tvFirst hasLabel tvFirst" title="example2">
  <label class="tvLabel">Name:</label>
  <span class="tvValue">Value</span>
<div class="clear"></div></div>

回答1:


You could use IHTMLDocument2 DOM to parse whatever elements you need from the HTML:

uses ActiveX, MSHTML;

const
  HTML =
  '<div class="tvRow tvFirst hasLabel tvFirst" title="example1">' +
  '<label class="tvLabel">Name:</label>' +
  '<span class="tvValue">Value</span>' +
  '<div class="clear"></div>' +
  '</div>';

procedure TForm1.Button1Click(Sender: TObject);
var
  doc: OleVariant;
  el: OleVariant;
  i: Integer;
begin
  doc := coHTMLDocument.Create as IHTMLDocument2;
  doc.write(HTML);
  doc.close;
  ShowMessage(doc.body.innerHTML);
  for i := 0 to doc.body.all.length - 1 do
  begin
    el := doc.body.all.item(i);
    if (el.tagName = 'LABEL') and (el.className = 'tvLabel') then
      ShowMessage(el.innerText);
    if (el.tagName = 'SPAN') and (el.className = 'tvValue') then
      ShowMessage(el.innerText);
  end;
end;

I wanted to mention another very nice HTML parser I found today: htmlp (Delphi Dom HTML Parser and Converter). It's not as flexible as the IHTMLDocument2 obviously, but it's very easy to work with, fast, free, and supports Unicode for older Delphi versions.

Sample usage:

uses HtmlParser, DomCore;

function GetDocBody(HtmlDoc: TDocument): TElement;
var
  i: integer;
  node: TNode;
begin
  Result := nil;
  for i := 0 to HtmlDoc.documentElement.childNodes.length - 1 do
  begin
    node := HtmlDoc.documentElement.childNodes.item(i);
    if node.nodeName = 'body' then
    begin
      Result := node as TElement;
      Break;
    end;
  end;
end;

procedure THTMLForm.Button2Click(Sender: TObject);
var
  HtmlParser: THtmlParser;
  HtmlDoc: TDocument;
  i: Integer;
  body, el: TElement;
  node: TNode;
begin
  HtmlParser := THtmlParser.Create;
  try
    HtmlDoc := HtmlParser.parseString(HTML);
    try
      body := GetDocBody(HtmlDoc);
      if Assigned(body) then
        for i := 0 to body.childNodes.length - 1 do
        begin
          node := body.childNodes.item(i);
          if (node is TElement) then
          begin
            el := node as TElement;
            if (el.tagName = 'div') and (el.GetAttribute('class') = 'tvRow tvFirst hasLabel tvFirst') then
            begin
              // iterate el.childNodes here...
              ShowMessage(IntToStr(el.childNodes.length));
            end;
          end;
        end;
    finally
      HtmlDoc.Free;
    end;
  finally
    HtmlParser.Free
  end;
end;



回答2:


Use a HTML Parser to work on your html files.

Maybe DIHtmlParser will do the job.

RegEx is not a parser and converting from HTML to JSON is not a wise option.




回答3:


One can also use a combination of HTMLP parser with THtmlFormatter and OXml XPath parsing

uses
  // Htmlp
  HtmlParser,
  DomCore,
  Formatter,
  // OXml
  OXmlPDOM,
  OXmlUtils;

function HtmlToXHtml(const Html: string): string;
var
  HtmlParser: THtmlParser;
  HtmlDoc: TDocument;
  Formatter: THtmlFormatter;
begin
  HtmlParser := THtmlParser.Create;
  try
    HtmlDoc := HtmlParser.ParseString(Html);
    try
      Formatter := THtmlFormatter.Create;
      try
        Result := Formatter.GetText(HtmlDoc);
      finally
        Formatter.Free;
      end;
    finally
      HtmlDoc.Free;
    end;
  finally
    HtmlParser.Free;
  end;
end;

type
  TCard = record
    Store: string;
    Quality: string;
    Quantity: string;
    Price: string;
  end;
  TCards = array of TCard;

function ParseCard(const Node: PXMLNode): TCard;
const
  StoreXPath = 'div[1]/ax';
  QualityXPath = 'div[3]';
  QuantityXPath = 'div[4]';
  PriceXPath = 'div[5]';
var
  CurrentNode: PXMLNode;
begin
  Result := Default(TCard);
  if Node.SelectNode(StoreXPath, CurrentNode) then
     Result.Store := CurrentNode.Text;
  if Node.SelectNode(QualityXPath, CurrentNode) then
     Result.Quality := CurrentNode.Text;
  if Node.SelectNode(QuantityXPath, CurrentNode) then
     Result.Quantity := CurrentNode.Text;
  if Node.SelectNode(PriceXPath, CurrentNode) then
     Result.Price := CurrentNode.Text;
end;

procedure THTMLForm.OpenButtonClick(Sender: TObject);
var
  Html: string;
  Xml: string;
  FXmlDocument: IXMLDocument;
  QueryNode: PXMLNode;
  XPath: string;
  NodeList: IXMLNodeList;
  i: Integer;
  Card: TCard;
begin
  Html := System.IOUtils.TFile.ReadAllText(FileNameEdit.Text, TEncoding.UTF8);
  Xml := HtmlToXHtml(Html);
  Memo.Lines.Text := Xml;

  // Parse with XPath
  FXMLDocument := CreateXMLDoc;
  FXMLDocument.WriterSettings.IndentType := itIndent;
  if not FXMLDocument.LoadFromXML(Xml) then
    raise Exception.Create('Source document is not valid');
  QueryNode := FXmlDocument.DocumentElement;
  XPath := '//div[@class="row pricetableline"]';
  NodeList := QueryNode.SelectNodes(XPath);
  for i := 0 to NodeList.Count -1 do
  begin
    Card := ParseCard(NodeList[i]);
    Memo.Lines.Text := Memo.Lines.Text + sLineBreak +
      Format('%0:s %1:s %2:s %3:s', [Card.Store, Card.Quality, Card.Quantity, Card.Price]);
  end;

  Memo.SelStart := 0;
  Memo.SelLength := 0;
end;


来源:https://stackoverflow.com/questions/14348346/html-tag-parsing

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!