HTML Tag Parsing

前端 未结 3 2053
日久生厌
日久生厌 2020-12-05 01:02

How can I parse Name: & Value text from within the tag with DIHtmlParser? I tried doing it with TCLHtmlParser from Clever Components but it failed. Seco

3条回答
  •  借酒劲吻你
    2020-12-05 01:59

    One can also combine the HTMLP parser (with its THtmlFormatter) and OXml's XPath support:

    uses
      // Htmlp
      HtmlParser,
      DomCore,
      Formatter,
      // OXml
      OXmlPDOM,
      OXmlUtils;
    
    { Parses Html with THtmlParser and returns the text THtmlFormatter produces
      for the resulting document. The parser, the parsed document and the
      formatter are all freed before returning, including when an exception
      is raised along the way. }
    function HtmlToXHtml(const Html: string): string;
    var
      Parser: THtmlParser;
      Doc: TDocument;
      Fmt: THtmlFormatter;
    begin
      // Start from nil so the single finally block can free unconditionally;
      // Free is a no-op on a nil reference.
      Doc := nil;
      Fmt := nil;
      Parser := THtmlParser.Create;
      try
        Doc := Parser.ParseString(Html);
        Fmt := THtmlFormatter.Create;
        Result := Fmt.GetText(Doc);
      finally
        // Release in reverse order of acquisition.
        Fmt.Free;
        Doc.Free;
        Parser.Free;
      end;
    end;
    
    // Data extracted for one price-table row (filled in by ParseCard).
    // Every field holds node text as-is; nothing is converted to a number.
    type
      TCard = record
        Store: string;     // text selected by 'div[1]/ax' within the row
        Quality: string;   // text of the row's 'div[3]'
        Quantity: string;  // text of the row's 'div[4]'
        Price: string;     // text of the row's 'div[5]'
      end;
      // Dynamic array of parsed rows.
      TCards = array of TCard;
    
    { Builds a TCard from one row node. Each field is the text of the node
      selected by an XPath relative to Node; a field whose XPath selects
      nothing is left as an empty string. }
    function ParseCard(const Node: PXMLNode): TCard;

      // Text of the node that XPath selects below Node, or '' when there is none.
      function TextAt(const XPath: string): string;
      var
        Match: PXMLNode;
      begin
        if Node.SelectNode(XPath, Match) then
          Result := Match.Text
        else
          Result := '';
      end;

    begin
      Result.Store := TextAt('div[1]/ax');
      Result.Quality := TextAt('div[3]');
      Result.Quantity := TextAt('div[4]');
      Result.Price := TextAt('div[5]');
    end;
    
    { Reads the file named in FileNameEdit as UTF-8, converts it with
      HtmlToXHtml and shows the converted markup in Memo. Then loads that
      markup as XML, selects every div whose class attribute is exactly
      "row pricetableline", and appends one line per match containing the
      card's store, quality, quantity and price. Finally moves the caret to
      the start of the memo.

      Raises Exception when the converted markup cannot be loaded as XML. }
    procedure THTMLForm.OpenButtonClick(Sender: TObject);
    const
      RowXPath = '//div[@class="row pricetableline"]';
    var
      Xml: string;
      Report: string;
      XmlDoc: IXMLDocument;
      Rows: IXMLNodeList;
      i: Integer;
      Card: TCard;
    begin
      Xml := HtmlToXHtml(
        System.IOUtils.TFile.ReadAllText(FileNameEdit.Text, TEncoding.UTF8));
      // Show the converted markup right away so it is still visible for
      // inspection if loading it as XML fails below.
      Memo.Lines.Text := Xml;

      // Parse with XPath
      XmlDoc := CreateXMLDoc;
      // NOTE(review): writer settings should only matter when the document is
      // serialized, which this handler never does - confirm and drop if so.
      XmlDoc.WriterSettings.IndentType := itIndent;
      if not XmlDoc.LoadFromXML(Xml) then
        raise Exception.Create('Source document is not valid');

      Rows := XmlDoc.DocumentElement.SelectNodes(RowXPath);
      if Rows.Count > 0 then
      begin
        // Collect all result lines in a local string and assign the memo once.
        // Reading and rewriting Memo.Lines.Text for every row copies the whole
        // text each time and makes the control refresh per row.
        Report := Memo.Lines.Text;
        for i := 0 to Rows.Count - 1 do
        begin
          Card := ParseCard(Rows[i]);
          Report := Report + sLineBreak +
            Format('%0:s %1:s %2:s %3:s',
              [Card.Store, Card.Quality, Card.Quantity, Card.Price]);
        end;
        Memo.Lines.Text := Report;
      end;

      // Put the caret at the top with nothing selected.
      Memo.SelStart := 0;
      Memo.SelLength := 0;
    end;
    

提交回复
热议问题