How can I parse Name: & Value text from within the tag with DIHtmlParser? I tried doing it with TCLHtmlParser from Clever Components but it failed. Seco
Use an HTML parser to work on your HTML files.
Maybe DIHtmlParser will do the job.
RegEx is not a parser, and converting from HTML to JSON is not a wise option.
You could use IHTMLDocument2 DOM to parse whatever elements you need from the HTML:
uses ActiveX, MSHTML;
const
// Sample markup shared by the examples: one outer <div> row that contains
// a <label class="tvLabel">, a <span class="tvValue"> and an empty
// <div class="clear">. Button1Click writes it into an MSHTML document and
// Button2Click passes it to THtmlParser.parseString.
HTML =
'<div class="tvRow tvFirst hasLabel tvFirst" title="example1">' +
'<label class="tvLabel">Name:</label>' +
'<span class="tvValue">Value</span>' +
'<div class="clear"></div>' +
'</div>';
procedure TForm1.Button1Click(Sender: TObject);
// Loads the sample HTML constant into an in-memory MSHTML document, shows the
// resulting body markup, then walks every element below <body> and shows the
// inner text of each <label class="tvLabel"> and <span class="tvValue">.
var
  Document: OleVariant;
  Elements: OleVariant;
  Element: OleVariant;
  ElementCount: Integer;
  Index: Integer;
begin
  // All further access goes through late binding on the OleVariant.
  Document := coHTMLDocument.Create as IHTMLDocument2;
  Document.write(HTML);
  Document.close;
  ShowMessage(Document.body.innerHTML);

  // Fetch the flat element collection once and remember its size.
  Elements := Document.body.all;
  ElementCount := Elements.length;

  for Index := 0 to ElementCount - 1 do
  begin
    Element := Elements.item(Index);
    // The two cases are mutually exclusive, so one combined test reports
    // each matching element exactly once, in document order.
    if ((Element.tagName = 'LABEL') and (Element.className = 'tvLabel')) or
       ((Element.tagName = 'SPAN') and (Element.className = 'tvValue')) then
      ShowMessage(Element.innerText);
  end;
end;
I wanted to mention another very nice HTML parser I found today: htmlp (Delphi Dom HTML Parser and Converter). It's not as flexible as the IHTMLDocument2
obviously, but it's very easy to work with, fast, free, and supports Unicode for older Delphi versions.
Sample usage:
uses HtmlParser, DomCore;
function GetDocBody(HtmlDoc: TDocument): TElement;
// Returns the <body> element of a parsed document, or nil when it has none.
//
// HtmlDoc: document to search; may be nil.
// Result:  the first direct child of the document element whose nodeName is
//          'body', or nil if HtmlDoc is nil, has no document element, or the
//          document element has no such child.
var
  i: Integer;
  root: TNode;
  node: TNode;
begin
  Result := nil;
  if not Assigned(HtmlDoc) then
    Exit;

  // Look the document element up once; without this guard a document that
  // has no root element would be dereferenced through a nil reference.
  root := HtmlDoc.documentElement;
  if not Assigned(root) then
    Exit;

  for i := 0 to root.childNodes.length - 1 do
  begin
    node := root.childNodes.item(i);
    if node.nodeName = 'body' then
    begin
      Result := node as TElement;
      Break;
    end;
  end;
end;
procedure THTMLForm.Button2Click(Sender: TObject);
// Parses the sample HTML constant with THtmlParser, locates <body> through
// GetDocBody and, for every direct child <div> whose class attribute equals
// RowClass, shows that element's child-node count.
const
  RowClass = 'tvRow tvFirst hasLabel tvFirst';
var
  Parser: THtmlParser;
  Document: TDocument;
  BodyElement: TElement;
  Row: TElement;
  Child: TNode;
  Index: Integer;
begin
  Parser := THtmlParser.Create;
  try
    Document := Parser.parseString(HTML);
    try
      BodyElement := GetDocBody(Document);
      // Nothing to inspect without a body; the finally blocks still run.
      if not Assigned(BodyElement) then
        Exit;

      for Index := 0 to BodyElement.childNodes.length - 1 do
      begin
        Child := BodyElement.childNodes.item(Index);
        // Skip anything that is not an element node.
        if not (Child is TElement) then
          Continue;

        Row := Child as TElement;
        // Skip elements that are not the row <div> we are looking for.
        if (Row.tagName <> 'div') or (Row.GetAttribute('class') <> RowClass) then
          Continue;

        // iterate Row.childNodes here...
        ShowMessage(IntToStr(Row.childNodes.length));
      end;
    finally
      Document.Free;
    end;
  finally
    Parser.Free;
  end;
end;
One can also use a combination of the HTMLP parser with THtmlFormatter and OXml XPath parsing:
uses
// Htmlp
HtmlParser,
DomCore,
Formatter,
// OXml
OXmlPDOM,
OXmlUtils;
function HtmlToXHtml(const Html: string): string;
// Parses Html with THtmlParser and returns the text that THtmlFormatter
// produces for the resulting document.
//
// Html:   source markup.
// Result: the string returned by THtmlFormatter.GetText for the parsed
//         document.
var
  Parser: THtmlParser;
  Document: TDocument;
  Writer: THtmlFormatter;
begin
  // Start from nil so one finally block can release whatever was created;
  // Free is a no-op on nil references.
  Parser := nil;
  Document := nil;
  Writer := nil;
  try
    Parser := THtmlParser.Create;
    Document := Parser.ParseString(Html);
    Writer := THtmlFormatter.Create;
    Result := Writer.GetText(Document);
  finally
    // Release in reverse order of creation.
    Writer.Free;
    Document.Free;
    Parser.Free;
  end;
end;
type
// One row extracted by ParseCard. Each field receives the Text of the node
// selected by the corresponding XPath and is left empty when that node is
// not found.
TCard = record
Store: string;
Quality: string;
Quantity: string;
Price: string;
end;
// Dynamic array of TCard; not referenced by the code visible here.
TCards = array of TCard;
function ParseCard(const Node: PXMLNode): TCard;
// Builds a TCard from one row node by evaluating four XPath expressions
// relative to Node. A field stays empty when its XPath selects nothing.
//
// Node:   row node the XPath expressions are evaluated against.
// Result: card whose fields hold the Text of the selected nodes.
const
  StoreXPath = 'div[1]/ax';
  QualityXPath = 'div[3]';
  QuantityXPath = 'div[4]';
  PriceXPath = 'div[5]';

  // Returns the Text of the node selected by XPath, or '' if none is found.
  function TextAt(const XPath: string): string;
  var
    Found: PXMLNode;
  begin
    if Node.SelectNode(XPath, Found) then
      Result := Found.Text
    else
      Result := '';
  end;

begin
  Result := Default(TCard);
  Result.Store := TextAt(StoreXPath);
  Result.Quality := TextAt(QualityXPath);
  Result.Quantity := TextAt(QuantityXPath);
  Result.Price := TextAt(PriceXPath);
end;
procedure THTMLForm.OpenButtonClick(Sender: TObject);
// Reads the HTML file named in FileNameEdit as UTF-8, converts it with
// HtmlToXHtml, shows the converted text in Memo, then selects every
// <div class="row pricetableline"> via XPath and appends one line per parsed
// card (store, quality, quantity, price) below the converted text.
//
// Raises Exception with 'Source document is not valid' when the converted
// text cannot be loaded as XML.
const
  CardRowXPath = '//div[@class="row pricetableline"]';
var
  Html: string;
  Xml: string;
  Report: string;
  XmlDocument: IXMLDocument;
  NodeList: IXMLNodeList;
  Card: TCard;
  i: Integer;
begin
  Html := System.IOUtils.TFile.ReadAllText(FileNameEdit.Text, TEncoding.UTF8);
  Xml := HtmlToXHtml(Html);
  // Show the converted markup first so it is visible even if loading fails.
  Memo.Lines.Text := Xml;

  // Parse with XPath
  XmlDocument := CreateXMLDoc;
  XmlDocument.WriterSettings.IndentType := itIndent;
  if not XmlDocument.LoadFromXML(Xml) then
    raise Exception.Create('Source document is not valid');

  NodeList := XmlDocument.DocumentElement.SelectNodes(CardRowXPath);
  if NodeList.Count > 0 then
  begin
    // Accumulate the card lines in a local string and update the memo once.
    // Reading and rewriting the control's whole text for every card does
    // quadratic work and repaints the memo on each iteration.
    Report := Memo.Lines.Text;
    for i := 0 to NodeList.Count - 1 do
    begin
      Card := ParseCard(NodeList[i]);
      Report := Report + sLineBreak +
        Format('%0:s %1:s %2:s %3:s', [Card.Store, Card.Quality, Card.Quantity, Card.Price]);
    end;
    Memo.Lines.Text := Report;
  end;

  // Move the caret back to the top with nothing selected.
  Memo.SelStart := 0;
  Memo.SelLength := 0;
end;