Source code for lazyscraper.patterns

# -*- coding: utf8 -*-
from .consts import *
from .htmltools import table_to_dict, taglist_to_dict

[docs]def pattern_extract_simpleul(tree, nodeclass, nodeid, fields): """Simple UL lists extractor pattern""" if nodeclass: xfilter = "//ul[@class='%s']/li//a" % (nodeclass) elif nodeid: xfilter = "//ul[@id='%s']/li//a" % (nodeid) else: xfilter = '//ul/li//a' tags = tree.xpath(xfilter) data = taglist_to_dict(tags, fields) return data
[docs]def pattern_extract_simpleoptions(tree, nodeclass, nodeid, fields): """Simple SELECT / OPTION extractor pattern""" if nodeclass: xfilter = "//select[@class='%s']/option" % (nodeclass) elif nodeid: xfilter = "//select[@id='%s']/option" % (nodeid) else: xfilter = '//select/option' tags = tree.xpath(xfilter) data = taglist_to_dict(tags, fields) return data
[docs]def pattern_extract_exturls(tree, nodeclass, nodeid, fields): """Pattern to extract external urls""" if nodeclass: xfilter = "//a[@class='%s']" % (nodeclass) elif nodeid: xfilter = "//a[@id='%s']" % (nodeid) else: xfilter = '//a' tags = tree.xpath(xfilter) filtered = [] for t in tags: if 'href' in t.attrib.keys(): if t.attrib['href'][:6] in ['http:/', 'https:']: filtered.append(t) data = taglist_to_dict(filtered, fields) return data
[docs]def pattern_extract_forms(tree, nodeclass, nodeid, fields): """Extracts web forms from page""" res = [] formattrlist = ['name', 'id', 'action', 'class', 'method'] inputattrlist = ['name', 'id', 'type', 'class', 'value', 'src', 'size'] textarealist = ['name', 'id', 'size', 'class'] buttonlist = ['name', 'id', 'value', 'class'] selectlist = ['name', 'id', 'multiple', 'size', 'class'] optionlist = ['value', 'selected', 'class'] tagnames = [('input', inputattrlist), ('textarea', textarealist), ('button', buttonlist), ('select', selectlist)] allforms = tree.xpath('//form') for form in allforms: fkey = {} for k in formattrlist: if form.attrib.has_key(k): fkey[k] = form.attrib[k]# for tag in form.iterdescendants(): if not hasattr(tag, 'tag'): continue for tagname, tlist in tagnames: if tag.tag == tagname: if not tagname in fkey.keys(): fkey[tagname] = [] tval = {'text' : tag.text} for k in tlist: if tag.attrib.has_key(k): tval[k] = tag.attrib[k] if tag.tag == 'select': tval['options'] = [] options = tag.xpath('option') for o in options: optionval = {'text' : o.text} for k in optionlist: if o.attrib.has_key(k): optionval[k] = o.attrib[k] tval['options'].append(optionval) fkey[tagname].append(tval) res.append(fkey) return {'total' : len(res), 'list' : res}
PATTERNS = { 'simpleul' : {'func' : pattern_extract_simpleul, 'deffields' : DEFAULT_URL_FIELDS, 'json_only' : False }, 'simpleopt' : {'func' : pattern_extract_simpleoptions, 'deffields' : DEFAULT_SELECT_FIELDS , 'json_only' : False }, 'exturls' : {'func' : pattern_extract_exturls, 'deffields' : DEFAULT_URL_FIELDS, 'json_only' : False }, 'getforms' : {'func' : pattern_extract_forms, 'deffields' : None, 'json_only' : True }, }