I have a table that I need to parse; specifically, it is a school schedule with 4 blocks of time and 5 blocks of days for every week. I've attempted to parse it, but honest…
UPDATE (removed previous function)
UPDATE2 fixed and simplified.
My first function was wrong. Here's another one; it works, but still needs tests:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import defaultdict
def table_to_list(table):
    """Convert a table element into a list of rows of cell texts.

    The table is first laid out on a row/column grid by
    ``table_to_2d_dict`` (which repeats the text of spanning cells in every
    position they cover); the grid is then flattened in row/column order by
    ``iter_2d_dict``.

    Args:
        table: Table element accepted by ``table_to_2d_dict``.

    Returns:
        A list with one inner list of cell texts per grid row.
    """
    return list(iter_2d_dict(table_to_2d_dict(table)))
def table_to_2d_dict(table):
    """Lay out a table's cells on a row/column grid, expanding spans.

    Args:
        table: Table element exposing ``xpath()``. Each row returned by
            ``./tr`` must expose ``xpath()`` too, and each cell must expose
            ``get()`` and ``text_content()`` (the script entry point passes
            elements parsed with ``lxml.html``).

    Returns:
        A ``defaultdict`` mapping row index -> ``defaultdict`` mapping
        column index -> cell text. The text of a cell with ``rowspan`` /
        ``colspan`` is repeated in every grid position the cell covers.

    Raises:
        ValueError: If a ``rowspan``/``colspan`` attribute is not an integer.
    """
    # `str` replaces the Python 2-only name `unicode`, which raised NameError
    # on Python 3 as soon as the first row was created.
    result = defaultdict(lambda: defaultdict(str))
    for row_i, row in enumerate(table.xpath('./tr')):
        for col_i, col in enumerate(row.xpath('./td|./th')):
            # A span below 1 would make the ranges below empty and silently
            # drop the cell, so every cell occupies at least one slot.
            colspan = max(int(col.get('colspan', 1)), 1)
            rowspan = max(int(col.get('rowspan', 1)), 1)
            col_data = col.text_content()
            # Move right past slots already claimed by earlier cells of this
            # row or by row spans from above. `row_i in result` is tested
            # first so that probing never creates an empty row entry.
            while row_i in result and col_i in result[row_i]:
                col_i += 1
            for i in range(row_i, row_i + rowspan):
                for j in range(col_i, col_i + colspan):
                    result[i][j] = col_data
    return result
def iter_2d_dict(dct):
    """Yield the rows of a two-level mapping as lists, ordered by key.

    Args:
        dct: Mapping of row key -> mapping of column key -> value. Keys on
            each level must be mutually orderable.

    Yields:
        One list per row, in ascending row-key order, holding that row's
        values in ascending column-key order.
    """
    # Sorting the keys alone is enough: keys are unique, so the values never
    # need to take part in the comparison.
    for row_key in sorted(dct):
        row = dct[row_key]
        yield [row[col_key] for col_key in sorted(row)]
if __name__ == '__main__':
    # Demo entry point: parse every <table> found in tables.html and
    # pretty-print it as a list of rows.
    import lxml.html
    from pprint import pprint

    doc = lxml.html.parse('tables.html')
    for table_el in doc.xpath('//table'):
        pprint(table_to_list(table_el))
tables.html:
<!-- Fixture 1: 4 rows x 5 columns. The "Thing" cell in the first row uses
     rowspan="4", so each of the following rows declares only four <td> cells. -->
<table border="1">
<tr>
<td>1 </td>
<td>1 </td>
<td>1 </td>
<td rowspan="4">Thing</td>
<td>1 </td>
</tr>
<tr>
<td>2 </td>
<td>2 </td>
<td>2 </td>
<td>2 </td>
</tr>
<tr>
<td>3 </td>
<td>3 </td>
<td>3 </td>
<td>3 </td>
</tr>
<tr>
<td>4 </td>
<td>4 </td>
<td>4 </td>
<td>4 </td>
</tr>
</table>
<!-- Fixture 2: every cell spans several rows, and #1 additionally has
     colspan="2". The two empty <tr></tr> rows declare no cells of their own. -->
<table border="1">
<tr>
<td colspan="2" rowspan="4">#1</td>
<td rowspan="4">#2</td>
<td rowspan="2">#3</td>
<td rowspan="2">#4</td>
</tr>
<tr></tr>
<tr>
<td rowspan="2">#5</td>
<td rowspan="2">#6</td>
</tr>
<tr></tr>
</table>
Output:
[['1 ', '1 ', '1 ', 'Thing', '1 '],
['2 ', '2 ', '2 ', 'Thing', '2 '],
['3 ', '3 ', '3 ', 'Thing', '3 '],
['4 ', '4 ', '4 ', 'Thing', '4 ']]
[['#1', '#1', '#2', '#3', '#4'],
['#1', '#1', '#2', '#3', '#4'],
['#1', '#1', '#2', '#5', '#6'],
['#1', '#1', '#2', '#5', '#6']]
Update: There is a bug in this answer (which is based on reclosedev's solution).
See How to parse table with rowspan and colspan
Old:
For those who want a Python 3 and BeautifulSoup solution,
def table_to_2d(table_tag):
    """Convert a BeautifulSoup ``<table>`` tag into a rectangular 2-D list.

    Each row found by ``table_tag("tr")`` becomes one list. The text of a
    cell with ``rowspan``/``colspan`` is repeated in every grid position the
    cell covers.

    Args:
        table_tag: Callable tag object: ``table_tag("tr")`` must return the
            row tags and ``row(["td", "th"])`` the cell tags of a row, as
            BeautifulSoup tags do.

    Returns:
        A list of equally long rows of cell texts. Positions no cell covers
        hold ``None``. An empty list is returned when there are no rows.

    Raises:
        ValueError: If a ``rowspan``/``colspan`` attribute is not an integer.
    """
    rows = table_tag("tr")
    # Rows start empty and grow as cells are placed. Sizing the grid from the
    # number of cells in the first row (as before) dropped cells whenever the
    # first row used colspan or a later row was wider.
    table = [[] for _ in rows]
    for row_i, row in enumerate(rows):
        for col_i, cell in enumerate(row(["td", "th"])):
            insert(table, row_i, col_i, cell, grow=True)
    # Pad short rows so the result is rectangular.
    width = max((len(line) for line in table), default=0)
    for line in table:
        line.extend([None] * (width - len(line)))
    return table


def _span(element, name):
    """Return the span attribute *name* of *element* as an int, at least 1."""
    return max(int(element[name]), 1) if element.has_attr(name) else 1


def insert(table, row, col, element, *, grow=False):
    """Place *element* in the first free slot of ``table[row]`` at or after *col*.

    The element's text is written to the whole ``rowspan`` x ``colspan``
    rectangle anchored at that slot, overwriting whatever the rectangle
    already contains.

    Args:
        table: Grid (list of row lists) to modify in place; ``None`` marks a
            free slot.
        row: Index of the row the element belongs to.
        col: Column index at which the search for a free slot starts.
        element: Cell tag exposing ``get_text()``, ``has_attr()`` and item
            access for its attributes.
        grow: When false (the default) the grid keeps its size: an element
            with no free slot in its row is ignored and spans are clipped at
            the right edge. When true, rows are extended to the right as
            needed. Row spans are always clipped at the last existing row.

    Raises:
        ValueError: If a ``rowspan``/``colspan`` attribute is not an integer.
    """
    if row >= len(table):
        return
    line = table[row]
    # Skip slots already taken by earlier cells or by row spans from above
    # (iterative, so wide tables cannot hit the recursion limit).
    while col < len(line) and line[col] is not None:
        col += 1
    if col >= len(line) and not grow:
        return

    value = element.get_text()
    end_col = col + _span(element, "colspan")
    end_row = min(row + _span(element, "rowspan"), len(table))
    # Fill the full rectangle: a cell carrying both rowspan and colspan
    # previously filled only its first row and first column.
    for target in table[row:end_row]:
        if grow and len(target) < end_col:
            target.extend([None] * (end_col - len(target)))
        for c in range(col, min(end_col, len(target))):
            target[c] = value
Usage:
# Example: a three-row table whose second row mixes rowspan="2" and colspan="2".
# NOTE(review): BeautifulSoup is not imported anywhere in this snippet;
# presumably `from bs4 import BeautifulSoup` is required first - confirm.
soup = BeautifulSoup('<table><tr><th>1</th><th>2</th><th>5</th></tr><tr><td rowspan="2">3</td><td colspan="2">4</td></tr><tr><td>6</td><td>7</td></tr></table>', 'html.parser')
# Should print [['1', '2', '5'], ['3', '4', '4'], ['3', '6', '7']].
print(table_to_2d(soup.table))
This is NOT optimized. I wrote this for my one-time script.