HTML Tag Parsing

前端 未结 3 2046
日久生厌
日久生厌 2020-12-05 01:02

How can I parse the "Name:" and "Value" text from within the tag with DIHtmlParser? I tried doing it with TCLHtmlParser from Clever Components, but it failed. Seco…

相关标签:
3条回答
  • 2020-12-05 01:40

    Use an HTML parser to work on your HTML files.

    Maybe DIHtmlParser will do the job.

    RegEx is not a parser, and converting from HTML to JSON is not a wise option.

    0 讨论(0)
  • 2020-12-05 01:47

    You could use IHTMLDocument2 DOM to parse whatever elements you need from the HTML:

    uses ActiveX, MSHTML;
    
    // Sample markup used as parser input: a single row <div> that wraps a
    // <label class="tvLabel">, a <span class="tvValue"> and an empty
    // <div class="clear">. The literal is assembled from adjacent string
    // pieces with no line breaks between them.
    const
      HTML =
      '<div class="tvRow tvFirst hasLabel tvFirst" title="example1">' +
      '<label class="tvLabel">Name:</label>' +
      '<span class="tvValue">Value</span>' +
      '<div class="clear"></div>' +
      '</div>';
    
    { Writes the HTML constant into a freshly created MSHTML document through
      late-bound OleVariant calls, shows the resulting body markup, and then
      walks every element under <body>, showing the inner text of each LABEL
      with class "tvLabel" and each SPAN with class "tvValue". }
    procedure TForm1.Button1Click(Sender: TObject);
    var
      htmlDoc: OleVariant;
      elements: OleVariant;
      element: OleVariant;
      idx: Integer;
      isLabel, isValue: Boolean;
    begin
      htmlDoc := coHTMLDocument.Create as IHTMLDocument2;
      htmlDoc.write(HTML);
      htmlDoc.close;
      ShowMessage(htmlDoc.body.innerHTML);

      // Flat collection of all elements below <body>.
      elements := htmlDoc.body.all;
      for idx := 0 to elements.length - 1 do
      begin
        element := elements.item(idx);

        // tagName is compared against the upper-case tag names.
        isLabel := (element.tagName = 'LABEL') and (element.className = 'tvLabel');
        if isLabel then
          ShowMessage(element.innerText);

        isValue := (element.tagName = 'SPAN') and (element.className = 'tvValue');
        if isValue then
          ShowMessage(element.innerText);
      end;
    end;
    

    I also want to mention another very nice HTML parser I found today: htmlp (Delphi DOM HTML Parser and Converter). Obviously, it's not as flexible as IHTMLDocument2, but it's very easy to work with, fast, free, and supports Unicode in older Delphi versions.

    Sample usage:

    uses HtmlParser, DomCore;
    
    { Returns the "body" element found among the direct children of the
      document's root element.

      HtmlDoc: parsed document to search; may be nil.
      Result:  the first direct child of the root whose nodeName is 'body',
               or nil when HtmlDoc is nil, the document has no root element,
               or no such child exists. }
    function GetDocBody(HtmlDoc: TDocument): TElement;
    var
      i: integer;
      root: TNode;
      node: TNode;
    begin
      Result := nil;

      // Guard against a missing document or an empty parse result; without
      // these checks the childNodes access below would dereference nil.
      if not Assigned(HtmlDoc) then
        Exit;
      root := HtmlDoc.documentElement;
      if not Assigned(root) then
        Exit;

      for i := 0 to root.childNodes.length - 1 do
      begin
        node := root.childNodes.item(i);
        if node.nodeName = 'body' then
        begin
          Result := node as TElement;
          Break;
        end;
      end;
    end;
    
    { Parses the HTML constant with THtmlParser, locates the body via
      GetDocBody and, for every direct child element of the body that is a
      'div' whose class attribute equals RowClass, shows the number of child
      nodes that div has. The document and the parser are freed on exit. }
    procedure THTMLForm.Button2Click(Sender: TObject);
    const
      RowClass = 'tvRow tvFirst hasLabel tvFirst';
    var
      Parser: THtmlParser;
      Doc: TDocument;
      idx: Integer;
      bodyEl, rowEl: TElement;
      child: TNode;
    begin
      Parser := THtmlParser.Create;
      try
        Doc := Parser.parseString(HTML);
        try
          bodyEl := GetDocBody(Doc);
          // Nothing to scan without a body; the finally blocks still run.
          if not Assigned(bodyEl) then
            Exit;

          for idx := 0 to bodyEl.childNodes.length - 1 do
          begin
            child := bodyEl.childNodes.item(idx);
            // Skip non-element nodes.
            if not (child is TElement) then
              Continue;

            rowEl := TElement(child);
            if (rowEl.tagName <> 'div') or (rowEl.GetAttribute('class') <> RowClass) then
              Continue;

            // Iterate rowEl.childNodes here to reach the label/value nodes.
            ShowMessage(IntToStr(rowEl.childNodes.length));
          end;
        finally
          Doc.Free;
        end;
      finally
        Parser.Free;
      end;
    end;
    
    0 讨论(0)
  • 2020-12-05 01:59

    One can also use a combination of the htmlp parser with THtmlFormatter and OXml XPath parsing:

    uses
      // Htmlp
      HtmlParser,
      DomCore,
      Formatter,
      // OXml
      OXmlPDOM,
      OXmlUtils;
    
    { Parses Html with THtmlParser and returns the text that THtmlFormatter
      produces for the resulting document.

      Html:   markup to parse.
      Result: the formatter's output for the parsed document.

      The formatter, the document and the parser are each released in their
      own finally block, innermost first. }
    function HtmlToXHtml(const Html: string): string;
    var
      Parser: THtmlParser;
      Doc: TDocument;
      Fmt: THtmlFormatter;
    begin
      Parser := THtmlParser.Create;
      try
        Doc := Parser.ParseString(Html);
        try
          Fmt := THtmlFormatter.Create;
          try
            Result := Fmt.GetText(Doc);
          finally
            Fmt.Free;
          end;
        finally
          Doc.Free;
        end;
      finally
        Parser.Free;
      end;
    end;
    
    type
      { One parsed row; each field holds the text of the node selected for
        it, or stays empty when no node matched. }
      TCard = record
        Store, Quality, Quantity, Price: string;
      end;

      { Dynamic array of parsed rows. }
      TCards = array of TCard;
    
    { Builds a TCard from the children of Node using XPath expressions
      relative to Node.

      Node:   row node whose child divs carry the card data.
      Result: a card whose fields hold the text of the matched nodes; a field
              is left empty when its XPath selects nothing. }
    function ParseCard(const Node: PXMLNode): TCard;

      // Text of the node selected by XPath relative to Node, or '' if none.
      function TextAt(const XPath: string): string;
      var
        Hit: PXMLNode;
      begin
        if Node.SelectNode(XPath, Hit) then
          Result := Hit.Text
        else
          Result := '';
      end;

    begin
      Result := Default(TCard);
      Result.Store := TextAt('div[1]/ax');
      Result.Quality := TextAt('div[3]');
      Result.Quantity := TextAt('div[4]');
      Result.Price := TextAt('div[5]');
    end;
    
    { Reads the file named in FileNameEdit as UTF-8, converts it with
      HtmlToXHtml, shows the converted text in Memo, then loads that text into
      an OXml document and appends one formatted line per node matching
      RowXPath. Raises Exception when the converted text cannot be loaded as
      XML. Finally resets the memo selection to the start. }
    procedure THTMLForm.OpenButtonClick(Sender: TObject);
    const
      RowXPath = '//div[@class="row pricetableline"]';
    var
      XhtmlText: string;
      XmlDoc: IXMLDocument;
      Rows: IXMLNodeList;
      RowIdx: Integer;
      Parsed: TCard;
    begin
      XhtmlText := HtmlToXHtml(
        System.IOUtils.TFile.ReadAllText(FileNameEdit.Text, TEncoding.UTF8));
      Memo.Lines.Text := XhtmlText;

      // Load the converted markup so it can be queried with XPath.
      XmlDoc := CreateXMLDoc;
      XmlDoc.WriterSettings.IndentType := itIndent;
      if not XmlDoc.LoadFromXML(XhtmlText) then
        raise Exception.Create('Source document is not valid');

      // Select the rows starting from the document's root element.
      Rows := XmlDoc.DocumentElement.SelectNodes(RowXPath);
      for RowIdx := 0 to Rows.Count - 1 do
      begin
        Parsed := ParseCard(Rows[RowIdx]);
        Memo.Lines.Text := Memo.Lines.Text + sLineBreak +
          Format('%0:s %1:s %2:s %3:s', [Parsed.Store, Parsed.Quality, Parsed.Quantity, Parsed.Price]);
      end;

      // Clear the selection and put the caret at the start of the memo.
      Memo.SelStart := 0;
      Memo.SelLength := 0;
    end;
    
    0 讨论(0)
提交回复
热议问题