Here is a Python solution that uses only the standard library; it takes advantage of the fact that the HTML happens to be well-formed XML. It can handle more than one row of data.
(Tested with Python 2.6 and 2.7. The question was later updated to say that the OP uses Python 2.4, so this answer may not be very useful in that case: ElementTree was only added in Python 2.5.)
from xml.etree.ElementTree import fromstring
HTML = """
<table class="details" border="0" cellpadding="5" cellspacing="2" width="95%">
<tr valign="top">
<th>Tests</th>
<th>Failures</th>
<th>Success Rate</th>
<th>Average Time</th>
<th>Min Time</th>
<th>Max Time</th>
</tr>
<tr valign="top" class="Failure">
<td>103</td>
<td>24</td>
<td>76.70%</td>
<td>71 ms</td>
<td>0 ms</td>
<td>829 ms</td>
</tr>
<tr valign="top" class="whatever">
<td>A</td>
<td>B</td>
<td>C</td>
<td>D</td>
<td>E</td>
<td>F</td>
</tr>
</table>"""

# Parse the markup with the XML parser; this relies on the table being
# well-formed XML.
tree = fromstring(HTML)

# findall("tr") returns the <tr> elements that are direct children of the
# root <table>. The first one is the header row, the rest are data rows.
rows = tree.findall("tr")
headrow = rows[0]
datarows = rows[1:]

# Print the table transposed: one line per column, holding the header
# followed by that column's value from every data row.
for num, h in enumerate(headrow):
    # An empty cell has .text == None and a short row has no cell at this
    # index; both are rendered as "" so join() only ever sees strings.
    data = ", ".join(
        [(row[num].text or "") if num < len(row) else "" for row in datarows]
    )
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("{0:<16}: {1}".format(h.text or "", data))
Output:
Tests           : 103, A
Failures        : 24, B
Success Rate    : 76.70%, C
Average Time    : 71 ms, D
Min Time        : 0 ms, E
Max Time        : 829 ms, F
Below is a Python regex-based solution that I have tested on Python 2.7. It doesn't rely on the xml module, so it will also work when the HTML is not well-formed XML.
import re
# input args: html string
# output: tables as a list, column max length
def extract_html_tables(html, strip_tags=False):
    """Extract the tables found in an HTML string using regular expressions.

    Matching is purely textual: a table runs from ``<table`` to the first
    following ``/table>``, a row from ``<tr`` to the first ``/tr>``, and a
    cell from ``<td``/``<th`` to the first ``/td>``/``/th>``. Nested tables
    are therefore not supported.

    Args:
        html: The HTML text to scan.
        strip_tags: When false (the default) each cell is returned with its
            surrounding markup, e.g. ``'<td>103</td>'``. When true, anything
            that looks like a tag (``<...>``) is removed from the cell first,
            leaving only its text.

    Returns:
        A ``(tables, maxlen)`` tuple. ``tables`` is a list with one entry per
        table; each table is a list of rows and each row is a list of cell
        strings stripped of leading/trailing whitespace. ``maxlen`` is the
        largest number of cells found in any row of any table (0 if there
        are no rows).
    """
    # DOTALL lets '.' span newlines, so tables, rows and cells may be
    # spread over several lines.
    table_re = re.compile(r'<table.*?/table>', re.DOTALL)
    row_re = re.compile(r'<tr.*?/tr>', re.DOTALL)
    cell_re = re.compile(r'<(td|th).*?/(td|th)>', re.DOTALL)
    tag_re = re.compile(r'<[^>]*>')

    tables = []
    maxlen = 0
    # finditer walks the non-overlapping matches left to right in a single
    # pass, instead of deleting each match and searching again from the start.
    for table_match in table_re.finditer(html):
        table = []
        for row_match in row_re.finditer(table_match.group()):
            row = []
            for cell_match in cell_re.finditer(row_match.group()):
                cell = cell_match.group()
                if strip_tags:
                    cell = tag_re.sub('', cell)
                row.append(cell.strip())
            table.append(row)
            maxlen = max(maxlen, len(row))
        tables.append(table)
    return tables, maxlen
html = """
<table class="details" border="0" cellpadding="5" cellspacing="2" width="95%">
<tr valign="top">
<th>Tests</th>
<th>Failures</th>
<th>Success Rate</th>
<th>Average Time</th>
<th>Min Time</th>
<th>Max Time</th>
</tr>
<tr valign="top" class="Failure">
<td>103</td>
<td>24</td>
<td>76.70%</td>
<td>71 ms</td>
<td>0 ms</td>
<td>829 ms</td>
</tr>
</table>"""

# Show the (tables, maxlen) tuple returned for the sample table above.
# A parenthesised single-argument print works on both Python 2 and Python 3.
print(extract_html_tables(html))