Source code for lazyscraper.htmltools

# -*- coding: utf8 -*-
from .consts import *

[docs]def table_to_dict(node, strip_lf=True): """Extracts data from table""" data = [] rows = node.xpath('./tbody/tr') if len(rows) == 0: rows = node.xpath('./tr') for row in rows: cells = [] for cell in row.xpath('(./td|./th)'): inner_tables = cell.xpath('./table') if len(inner_tables) < 1: text = ' '.join(cell.itertext()) #cell.text_content() if strip_lf: text = text.replace('\r',u' ').replace('\n', u' ').strip() cells.append(text) else: cells.append([table_to_dict(node, strip_lf) for t in inner_tables]) data.append(cells) return data
[docs]def taglist_to_dict(tags, fields, strip_lf=True): """Converts list of tags into dict""" has_text = TEXT_FIELD in fields has_tag = TAG_FIELD in fields finfields = fields.copy() data = [] if has_text: finfields.remove(TEXT_FIELD) if has_tag: finfields.remove(TAG_FIELD) for t in tags: item = {} if has_tag: item[TAG_FIELD] = t.tag if has_text: item[TEXT_FIELD] = ' '.join(t.itertext()).strip() if strip_lf: item[TEXT_FIELD] = (' '.join(item[TEXT_FIELD].split())).strip() for f in finfields: item[f] = t.attrib[f].strip() if f in t.attrib.keys() else "" data.append(item) return data