I want to parse the html table using html agility pack. I want to extract only some predefined column data from the table.
But I am new to parsing and html agility p
There is an example of that in the discussion forums here. Scroll down a bit to see the table answer. I do wish they would provide better samples that were easier to find.
EDIT:
To extract data from specific columns you would have to first find the Edit2:
If you don't know the indexes of the columns you could do the whole thing like this. I have not tested this. tags that correspond to the columns you want and remember their indexes. You would then need to find the tags for the same indexes. Assuming you know the indexes of the columns you could do something like this:
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml("http://somewhere.com");
HtmlNode table = doc.DocumentNode.SelectSingleNode("//table");
foreach (var row in table.SelectNodes("//tr"))
{
HtmlNode addressNode = row.SelectSingleNode("td[2]");
//do something with address here
HtmlNode phoneNode = row.SelectSingleNode("td[5]");
// do something with phone here
}
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml("http://somewhere.com");
var tables = doc.DocumentNode.SelectNodes("//table");
foreach(var table in tables)
{
int addressIndex = -1;
int phoneIndex = -1;
var headers = table.SelectNodes("//th");
for (int headerIndex = 0; headerIndex < headers.Count(); headerIndex++)
{
if (headers[headerIndex].InnerText == "address")
{
addressIndex = headerIndex;
}
else if (headers[headerIndex].InnerText == "phone")
{
phoneIndex = headerIndex;
}
}
if (addressIndex != -1 && phoneIndex != -1)
{
foreach (var row in table.SelectNodes("//tr"))
{
HtmlNode addressNode = row.SelectSingleNode("td[addressIndex]");
//do something with address here
HtmlNode phoneNode = row.SelectSingleNode("td[phoneIndex]");
// do something with phone here
}
}
}