# -*- coding: utf8 -*-
from .consts import *
def table_to_dict(node, strip_lf=True):
    """Extract the cell contents of a table element as nested lists.

    Despite the name, the result is a list of rows, not a dict.

    :param node: table element exposing ``xpath()``; its cells must expose
        ``xpath()`` and ``itertext()`` (lxml-style elements).
    :param strip_lf: when true, carriage returns and line feeds in a cell's
        text are replaced with spaces and the result is stripped.
    :return: list of rows; each row is a list of cells. A plain cell is its
        text; a cell containing directly nested tables is a list holding the
        extracted data of each of those tables.
    """
    data = []
    # Rows may sit under an explicit <tbody> or directly under the table.
    rows = node.xpath('./tbody/tr')
    if not rows:
        rows = node.xpath('./tr')
    for row in rows:
        cells = []
        for cell in row.xpath('(./td|./th)'):
            inner_tables = cell.xpath('./table')
            if inner_tables:
                # Recurse into each nested table itself; recursing on `node`
                # would process the same outer table again and never end.
                cells.append(
                    [table_to_dict(t, strip_lf) for t in inner_tables]
                )
            else:
                text = ' '.join(cell.itertext())
                if strip_lf:
                    text = text.replace('\r', u' ').replace('\n', u' ').strip()
                cells.append(text)
        data.append(cells)
    return data

def taglist_to_dict(tags, fields, strip_lf=True):
    """Convert a list of tags into a list of dicts, one dict per tag.

    :param tags: iterable of elements exposing ``tag``, ``attrib`` and
        ``itertext()`` (lxml-style elements).
    :param fields: container (list, tuple, set, ...) of field names to
        extract. ``TEXT_FIELD`` requests the element's text, ``TAG_FIELD``
        its tag name; every other entry is read as an attribute name.
        Both markers are presumably provided by ``.consts`` -- verify there.
    :param strip_lf: when true, every run of whitespace (including line
        breaks) in the text is collapsed to a single space.
    :return: list of dicts keyed by the requested fields; attributes missing
        from an element are reported as empty strings.
    """
    has_text = TEXT_FIELD in fields
    has_tag = TAG_FIELD in fields
    # Filter instead of fields.copy()/.remove(): this also accepts tuples and
    # Python 2 lists (neither has .copy()), never mutates the caller's
    # container, and drops repeated marker entries that would otherwise be
    # looked up as attributes and overwrite the text/tag value.
    attr_fields = [f for f in fields if f not in (TEXT_FIELD, TAG_FIELD)]
    data = []
    for t in tags:
        item = {}
        if has_tag:
            item[TAG_FIELD] = t.tag
        if has_text:
            text = ' '.join(t.itertext()).strip()
            if strip_lf:
                # split()/join collapses whitespace runs and trims both ends.
                text = ' '.join(text.split())
            item[TEXT_FIELD] = text
        for f in attr_fields:
            item[f] = t.attrib.get(f, "").strip()
        data.append(item)
    return data