Source code for burst.parser.ehp

# coding: utf-8
# Name:        ehp.py
# Author:      Iury de oliveira gomes figueiredo and Mancuniancol
# Created on:  28.11.2016
# Licence:     GPL v.3: http://www.gnu.org/copyleft/gpl.html

"""
All credit for this code goes to Iury de oliveira gomes figueiredo.
Easy Html Parser is an AST generator for html/xml documents. You can easily delete/insert/extract tags in html/xml
documents as well as look for patterns.
https://github.com/iogf/ehp
"""

from HTMLParser import HTMLParser
from collections import deque

# Version string of this parser module.
version = '1.3a'
# Node-kind markers. Nodes that are not tags store one of these integers in
# their ``name`` instead of a tag name.
DATA = 1  # text found between tags (Data nodes)
META = 2  # declarations serialised as <!...> (Meta nodes)
COMMENT = 3  # comments serialised as <!--...--> (Comment nodes)
PI = 4  # processing instructions serialised as <?...> (Pi nodes)
CODE = 5  # numeric character references serialised as &#... (Code nodes)
AMP = 6  # named entity references serialised as &... (Amp nodes)


class Attribute(dict):
    """
    Dictionary holding the attributes of a tag.

    Reading a key that is absent (or stored as None) gives the empty string
    instead of raising, so attributes can be inspected without guarding every
    lookup.

    Example:

        dom = Html().feed('<p style="color:green"> foo </p>')

        for ind in dom.sail():
            if ind.name == 'p':
                ind.attr['style'] = "color:blue"

    It would change the paragraph colour to blue.
    """

    def __getitem__(self, key):
        """
        Return the value stored under key, or "" when the key is missing or
        its value is None.
        """
        value = self.get(key)
        if value is None:
            return ""
        return value

    def __str__(self):
        """
        Return the attributes in html form: each pair is rendered as
        key="value" followed by one space.
        """
        return ''.join(['%s="%s" ' % pair for pair in self.items()])
class Root(list):
    """
    Outmost node of an xml/html document and base class of every other node.

    A node is a list of its child nodes that also carries a ``name`` and an
    ``attr`` mapping. Note that ``node[key]`` is redefined to read an
    attribute, so children have to be reached by iterating the node.

        html = Html()
        dom = html.feed('<html> ... </body>')
        dom.name == ''
        True
        type(dom) == Root
        True
    """

    def __init__(self, name=None, attr=None):
        """
        :param name: tag name, or one of the node-kind constants (DATA, ...).
        :param attr: initial attributes, in any form accepted by dict().
        """
        if attr is None:
            attr = {}
        self.name = name
        self.attr = Attribute(attr)
        # Initialise this instance (the old code initialised a temporary copy).
        list.__init__(self)

    # Keep the compact default repr instead of the list repr, which would
    # print the whole subtree.
    __repr__ = object.__repr__

    def __str__(self):
        """
        Return the concatenated string form of every child.
        """
        # '%s' formatting (rather than str()) keeps Python 2 unicode promotion.
        return ''.join(['%s' % child for child in self])

    def __call__(self, tag=None, order=1, select=None, attribute='text', divider=('', 1)):
        """
        Return the text, or one attribute, of a matching tag as a stripped
        string.

        :param tag: name of the tag to look for; None means this node itself.
        :param order: 1-based position of the wanted match.
        :param select: one (key, value) tuple or a list of them, passed to
            find() as attribute conditions.
        :param attribute: 'text' for the node text, otherwise the attribute name.
        :param divider: (separator, index); when separator is not '' the value
            is split on it and the stripped piece at index is returned.
        :return: the value, or '' when no tag matches or the piece is missing.

        Example:

            data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p>
                    <p style="color:green"> gamma.</p> </body><a href="www.google.com">hello</a>'
            dom = Html().feed(data)

            dom(tag='p', select=('style', 'color:green'))            # 'beta.'
            dom(tag='p', select=('style', 'color:green'), order=2)   # 'gamma.'
            dom(tag='a', attribute='href')                           # 'www.google.com'
        """
        node = self if tag is None else self._nth_match(tag, select, order)
        if node is None:
            return ''

        # Equality, not identity: 'text' may arrive as a non-interned string.
        if attribute == 'text':
            value = node.text()
        else:
            value = node.attr[attribute]
        value = '' if value is None else value.strip()

        separator, position = divider[0], divider[1]
        if separator == '':
            return value
        pieces = value.split(separator)
        if len(pieces) > position:
            return pieces[position].strip()
        return ''

    def __getitem__(self, item):
        """
        Return the attribute named item ("" when the node does not have it).
        """
        return self.attr[item]

    def _nth_match(self, tag, select, order):
        """
        Return the order-th (1-based) node produced by find() for tag and the
        conditions in select, or None when there are fewer matches.
        """
        if isinstance(select, tuple):
            select = [select]
        conditions = () if select is None else select
        for position, node in enumerate(self.find(tag, 1, 1, *conditions), 1):
            if position == order:
                return node
        return None

    @staticmethod
    def _attr_has_token(node, key, values):
        """
        Tell whether any whitespace-separated token of node.attr[key] equals
        values (or one of them, when values is a list).
        """
        wanted = values if isinstance(values, list) else [values]
        tokens = node.attr[key].split()
        return any(value == token for value in wanted for token in tokens)

    def sail(self):
        """
        Iterate over every descendant node, children before their parent.

        Example:

            dom = Html().feed('<a> <b> </b> </a>')

            for ind in dom.sail():
                print ind.name

        The nodes inside <b> and <a> are yielded before <b> and <a> themselves.
        """
        # list(self) snapshots the children so callers may edit the tree while
        # sailing; it also avoids self[:], which would go through the
        # attribute-reading __getitem__ on Python 3.
        for child in list(self):
            for descendant in child.sail():
                yield descendant
            yield child

    def index(self, item, **kwargs):
        """
        Like list.index but compares by identity instead of equality.

        :param item: the child node to locate.
        :param kwargs: accepted and ignored.
        :return: position of item among the children.
        :raises ValueError: when item is not a direct child.

        Example:

            dom = Html().feed('<a><b></b><b></b></a>')

            for root, ind in dom.sail_with_root():
                print root.name, ind.name, root.index(ind)

        It would print.

            a b 0
            a b 1
             a 0
        """
        for position, child in enumerate(self):
            if child is item:
                return position
        raise ValueError('item is not a child of this node')

    def remove(self, item):
        """
        Like list.remove but locates item by identity.

            dom = Html().feed('<a><b></b><b></b></a>')
            for root, ind in dom.sail_with_root():
                if ind.name == 'b':
                    root.remove(ind)

            print dom

        It should print.

            <a ></a>
        """
        del self[self.index(item)]

    def find(self, name='', every=1, start=1, *args):
        """
        Iterate over the descendants called name that satisfy every condition.

        :param name: node name to look for.
        :param every: step between yielded matches.
        :param start: 1-based position of the first match to yield.
        :param args: (key, values) conditions; values is one value or a list.
            A condition holds when any whitespace-separated token of the
            attribute equals one of the values.

        Example 1:

            dom = Html().feed('<a><b></b><b></b></a>')
            for ind in dom.find('b'):
                print ind

        It should print.

            <b ></b>
            <b ></b>

        Example 2 (conditions come after every and start):

            dom = Html().feed('<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>')
            for ind in dom.find('p', 1, 1, ('style', ['color:green', 'color:red'])):
                print ind

        Output.

            <p style="color:green" > beta.</p>
        """
        matched = 0
        for node in self.sail():
            if node.name != name:
                continue
            if not all(self._attr_has_token(node, key, values) for key, values in args):
                continue
            matched += 1
            if matched >= start and (matched - start) % every == 0:
                yield node

    def find_once(self, tag=None, select=None, order=1):
        """
        Return the order-th occurrence of tag matching the conditions in select.

        When tag is None or there is no such occurrence, an empty Tag('html')
        is returned instead of None.
        """
        node = None if tag is None else self._nth_match(tag, select, order)
        return Tag('html') if node is None else node

    def find_all(self, tag=None, select=None, every=1, start=1):
        """
        Return a list with all occurrences of tag matching the conditions in
        select (see find for every and start). The list is empty when tag is
        None.
        """
        if tag is None:
            return []
        if isinstance(select, tuple):
            select = [select]
        conditions = () if select is None else select
        return list(self.find(tag, every, start, *conditions))

    def find_with_root(self, name, *args):
        """
        Like find but yields (parent, node) pairs, and each condition compares
        the whole attribute value instead of its tokens.

            dom = Html().feed('''<body> <p> alpha </p> <p> beta </p> </body>''')
            for root, ind in dom.find_with_root('p'):
                root.remove(ind)

            print dom

        It would output.

            <body >   </body>
        """
        for root, node in self.sail_with_root():
            if node.name != name:
                continue
            for key, values in args:
                wanted = values if isinstance(values, list) else [values]
                if all(node.attr[key] != value for value in wanted):
                    break
            else:
                yield (root, node)

    def by_id(self, id_value):
        """
        Shortcut returning the first node whose 'id' attribute equals id_value.

        Example:

            dom = Html().feed('<a><b id="foo"></b></a>')

            print dom.by_id('foo')
            print dom.by_id('bar')

        It should print.

            <b id="foo" ></b>
            None
        """
        # take() expects (key, value) tuples; passing 'id' and the value as
        # two separate arguments never matched anything.
        return self.take(('id', id_value))

    def take(self, *args):
        """
        Return the first node whose attributes match every (key, value) pair
        in args, or None when there is none.

        Example:

            dom = Html().feed('<a><b id="foo" size="1"></b></a>')

            print dom.take(('id', 'foo'))
            print dom.take(('id', 'foo'), ('size', '2'))
        """
        return next(self.match(*args), None)

    def take_with_root(self, *args):
        """
        Like take but returns the (parent, node) pair, or None.
        """
        return next(self.match_with_root(*args), None)

    def match(self, *args):
        """
        Iterate over the descendants whose attributes equal every
        (key, value) pair in args.

        Example:

            dom = Html().feed('<a size="1"><b size="1"></b></a>')

            for ind in dom.match(('size', '1')):
                print ind

        It would print.

            <b size="1" ></b>
            <a size="1" ><b size="1" ></b></a>
        """
        for node in self.sail():
            if all(node.attr[key] == value for key, value in args):
                yield node

    def match_with_root(self, *args):
        """
        Like match but yields (parent, node) pairs.

        Example:

            dom = Html().feed('''<body> <p style="color:black"> xxx </p>
                                 <p style = "color:black"> mmm </p></body>''')

            for root, ind in dom.match_with_root(('style', 'color:black')):
                del ind.attr['style']

            print dom
        """
        for root, node in self.sail_with_root():
            if all(node.attr[key] == value for key, value in args):
                yield (root, node)

    def join(self, delim, *args):
        """
        Concatenate the string form of every descendant whose name is in args.

        Each matching node is preceded by delim, so a non-empty delim also
        starts the result.

        Example 1:

            dom = Html().feed('<a><b> This is cool. </b><b> That is. </b></a>')
            print dom.join('', 'b')

        It would print.

            <b > This is cool. </b><b > That is. </b>

        Example 2:

            dom = Html().feed('<a><b>alpha</b><c>beta</c><b>gamma</a>')
            print dom.join('\\n', DATA)

        It would print.

            alpha
            beta
            gamma
        """
        return ''.join(['%s%s' % (delim, node)
                        for node in self.sail() if node.name in args])

    def fst(self, name, *args):
        """
        Return the first node found by find(name, 1, 1, *args), or None.

        Example:

            dom = Html().feed('<body> <em> Cool. </em></body>')

            print dom.fst('em')

        It outputs.

            <em > Cool. </em>
        """
        return next(self.find(name, 1, 1, *args), None)

    def fst_with_root(self, name, *args):
        """
        Like fst but returns the (parent, node) pair found by find_with_root,
        or None.

        Example:

            dom = Html().feed('<body> <em> Cool. </em></body>')

            root, item = dom.fst_with_root('em')
            root.insert_after(item, Tag('p'))
            print root

        It outputs.

            <body > <em > Cool. </em><p ></p></body>
        """
        return next(self.find_with_root(name, *args), None)

    def text(self):
        """
        Return the concatenation of every DATA node below this node, that is
        the character data found inside the tag and its children.

        Example:

            dom = Html().feed('<body><em>This is all the text.</em></body>')

            print dom.fst('em').text()

        It outputs.

            This is all the text.
        """
        return self.join('', DATA)

    def write(self, filename):
        """
        Save the string form of the structure to the file named filename.
        """
        # The context manager closes the file even when the write fails.
        with open(filename, 'w') as fd:
            fd.write(str(self))

    def sail_with_root(self):
        """
        Like sail but yields (parent, node) pairs.

        For an example, see help(Root.remove).
        """
        # Snapshot for the same reasons as in sail().
        for child in list(self):
            for pair in child.sail_with_root():
                yield pair
            yield (self, child)

    def walk(self):
        """
        Like sail but yields (node, name, attr) triples.

        Example:

            dom = Html().feed('<body> <em> This is all the text.</em></body>')

            for ind, name, attr in dom.walk():
                print 'TAG:', ind
                print 'NAME:', name
                print 'ATTR:', attr
        """
        for node in self.sail():
            yield (node, node.name, node.attr)

    def walk_with_root(self):
        """
        Like walk but yields a pair of triples, the parent's first:
        ((root, name, attr), (node, name, attr)).
        """
        for root, node in self.sail_with_root():
            yield ((root, root.name, root.attr), (node, node.name, node.attr))

    def insert_after(self, y, k):
        """
        Insert node k right after child y.

        For an example, see help(Root.fst_with_root).
        """
        self.insert(self.index(y) + 1, k)

    def insert_before(self, y, k):
        """
        Insert node k right before child y.
        """
        self.insert(self.index(y), k)

    def parent(self, dom):
        """
        Return the parent, inside dom, of the first node whose string form
        equals this node's; None when nothing matches.
        """
        own_markup = str(self)
        for root, node in dom.sail_with_root():
            if str(node) == own_markup:
                return root

    def list_(self, text=""):
        """
        Return (path, node) pairs for every non-DATA descendant.

        A path is the space-separated chain of node names starting at text,
        each name followed by '.class' and '#id' when the node has those
        attributes (spaces in class become '.', spaces in id become '#').
        """
        result = []
        for child in list(self):
            path = text + ' ' + str(child.name)
            class_name = child["class"].replace(" ", ".")
            if class_name:
                path += "." + class_name
            id_name = child["id"].replace(" ", "#")
            if id_name:
                path += "#" + id_name
            if child.name != DATA:
                result.append((path.strip(), child))
            # Descend with the unstripped path so deeper paths keep growing.
            result.extend(child.list_(path))
        return result

    def select(self, text=""):
        """
        Return the nodes whose path, as built by list_, ends with text.
        """
        return [node for path, node in self.list_() if path.endswith(text)]

    def get_attributes(self, text):
        """
        Return the list_ path of the first node whose markup equals text once
        spaces (and, in text, semicolons) are removed; None when there is none.
        """
        wanted = text.replace(' ', '').replace(';', '')
        for path, node in self.list_():
            if wanted == str(node).replace(' ', ''):
                return path
class Tag(Root):
    """
    Node for a regular xml/html tag with an opening and a closing part:

        <name key="value" ...> ... </name>
    """

    def __init__(self, name, attr=None):
        """
        :param name: the tag's name.
        :param attr: the tag's attributes; empty when omitted.

        Example:

            d = {'style': 'background:blue;'}
            x = Tag('p', d)
        """
        Root.__init__(self, name, {} if attr is None else attr)

    def __str__(self):
        """
        Return the markup of the tag: opening tag, children, closing tag.
        """
        pieces = ['<%s %s>' % (self.name, self.attr)]
        pieces.extend(['%s' % child for child in self])
        pieces.append('</%s>' % self.name)
        return ''.join(pieces)
class Data(Root):
    """
    Node for character data, i.e. everything in the document that is not an
    xml/html token.

    The tokenizer creates one wherever it finds text between tags, so the
    structure mirrors the document.

    Example:

        dom = Html().feed('<body><em>alpha</em></body>')
        x = dom.fst('em')
        x.append(Data('\\nbeta'))
        print dom

    It outputs.

        <body ><em >alpha
        beta</em></body>
    """

    def __init__(self, data):
        """
        :param data: the characters held by this node.
        """
        self.data = data
        Root.__init__(self, DATA)

    def __str__(self):
        """
        Return the characters held by this node, unchanged.
        """
        return self.data

    def text(self):
        """
        Return the characters held by this node, unchanged.
        """
        return self.data
class XTag(Root):
    """
    Node for a self-closing tag in XHTML style, such as <img src="t.gif" />.

    Its string form never includes children.
    """

    def __init__(self, name, attr=None):
        """
        :param name: the tag's name.
        :param attr: the tag's attributes; empty when omitted.
        """
        Root.__init__(self, name, {} if attr is None else attr)

    def __str__(self):
        """
        Return the markup of the tag in the form <name key="value" />.
        """
        return '<%s %s/>' % (self.name, self.attr)
class Meta(Root):
    """
    Node for a declaration; it is named META and serialised as <!data>.
    """

    def __init__(self, data):
        """
        :param data: the declaration's content.
        """
        self.data = data
        Root.__init__(self, META)

    def __str__(self):
        """
        Return the declaration wrapped in '<!' and '>'.
        """
        return '<!%s>' % self.data
class Code(Root):
    """
    Node for a character reference; it is named CODE and serialised as
    &#data.
    """

    def __init__(self, data):
        """
        :param data: the reference's content.
        """
        self.data = data
        Root.__init__(self, CODE)

    def __str__(self):
        """
        Return the content prefixed with '&#'.
        """
        # NOTE(review): no ';' is appended here - confirm whether the
        # terminator is meant to be dropped when serialising.
        return '&#%s' % self.data
class Amp(Root):
    """
    Node for an entity reference; it is named AMP and serialised as &data.
    """

    def __init__(self, data):
        """
        :param data: the reference's content.
        """
        self.data = data
        Root.__init__(self, AMP)

    def __str__(self):
        """
        Return the content prefixed with '&'.
        """
        # NOTE(review): no ';' is appended here - confirm whether the
        # terminator is meant to be dropped when serialising.
        return '&%s' % self.data
class Pi(Root):
    """
    Node for a processing instruction; it is named PI and serialised as
    <?data>.
    """

    def __init__(self, data):
        """
        :param data: the instruction's content.
        """
        self.data = data
        Root.__init__(self, PI)

    def __str__(self):
        """
        Return the instruction wrapped in '<?' and '>'.
        """
        return '<?%s>' % self.data
class Comment(Root):
    """
    Node for a comment; it is named COMMENT and serialised as <!--data-->.
    """

    def __init__(self, data):
        """
        :param data: the comment's content.
        """
        self.data = data
        Root.__init__(self, COMMENT)

    def __str__(self):
        """
        Return the comment wrapped in '<!--' and '-->'.
        """
        return '<!--%s-->' % self.data
class Tree(object):
    """
    The engine class: it builds the node structure while tokens arrive.

    ``outmost`` is the Root that holds the whole document and ``stack`` keeps
    the chain of currently open nodes, the innermost one last.
    """

    def __init__(self):
        """
        Create the outmost node and a stack that holds only it.
        """
        self.outmost = Root('')
        self.stack = deque([self.outmost])

    def clear(self):
        """
        Start over with a fresh outmost node for a new parsing; the stack is
        emptied and then holds only that node.
        """
        self.outmost = Root('')
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self):
        """
        Return the node on top of the stack, i.e. the current scope.
        """
        return self.stack[-1]

    def _attach(self, node):
        """
        Append node as a child of the current scope without opening a scope.
        """
        self.last().append(node)

    def nest(self, name, attr):
        """
        Add a Tag to the current scope and make it the new current scope.
        """
        opened = Tag(name, attr)
        self.stack[-1].append(opened)
        self.stack.append(opened)

    def dnest(self, data):
        """
        Add a Data node to the current scope.
        """
        self._attach(Data(data))

    def xnest(self, name, attr):
        """
        Add an XTag to the current scope.
        """
        self._attach(XTag(name, attr))

    def ynest(self, data):
        """
        Add a Meta node to the current scope.
        """
        self._attach(Meta(data))

    def mnest(self, data):
        """
        Add a Comment node to the current scope.
        """
        self._attach(Comment(data))

    def cnest(self, data):
        """
        Add a Code node to the current scope.
        """
        self._attach(Code(data))

    def rnest(self, data):
        """
        Add an Amp node to the current scope.
        """
        self._attach(Amp(data))

    def inest(self, data):
        """
        Add a Pi node to the current scope.
        """
        self._attach(Pi(data))

    def enclose(self, name):
        """
        Handle a closing tag: pop scopes up to and including the innermost
        open node called name. When no open node has that name the stack is
        left untouched.
        """
        depth = 0
        for position, node in enumerate(reversed(self.stack), 1):
            if node.name == name:
                depth = position
                break

        # Everything opened after the matching node is closed along with it.
        for _ in range(depth):
            self.stack.pop()
class Html(HTMLParser):
    """
    The tokenizer class: an HTMLParser whose handlers record every token in
    a Tree (``self.structure``).
    """

    def __init__(self):
        """
        Initialise the underlying parser and create an empty structure.
        """
        HTMLParser.__init__(self)
        self.structure = Tree()

    def fromfile(self, filename):
        """
        Build a structure from the content of the file named filename.

        :return: the outmost node, as returned by feed.
        """
        # The context manager closes the file even when reading fails.
        with open(filename, 'r') as fd:
            data = fd.read()
        return self.feed(data)

    def feed(self, data):
        """
        Parse data into a fresh structure.

        :param data: the xml/html text to parse.
        :return: the outmost node of the structure that was built.
        """
        self.structure.clear()
        HTMLParser.feed(self, data)
        return self.structure.outmost

    def handle_starttag(self, name, attr):
        """
        When an opening tag is found, nest it onto the tree.
        """
        self.structure.nest(name, attr)

    def handle_startendtag(self, name, attr):
        """
        When a tag in XHTML style is found, add it to the tree as an XTag.
        """
        self.structure.xnest(name, attr)

    def handle_endtag(self, name):
        """
        When a closing tag is found, move the tree back to the right scope.
        """
        self.structure.enclose(name)

    def handle_data(self, data):
        """
        Nest character data onto the tree.
        """
        self.structure.dnest(data)

    def handle_decl(self, decl):
        """
        Nest a declaration onto the tree as a Meta node.
        """
        self.structure.ynest(decl)

    def unknown_decl(self, decl):
        """
        Nest an unrecognised declaration onto the tree as a Meta node.
        """
        self.structure.ynest(decl)

    def handle_charref(self, data):
        """
        Nest a character reference onto the tree as a Code node.
        """
        self.structure.cnest(data)

    def handle_entityref(self, data):
        """
        Nest an entity reference onto the tree as an Amp node.
        """
        self.structure.rnest(data)

    def handle_pi(self, data):
        """
        Nest a processing instruction onto the tree as a Pi node.
        """
        self.structure.inest(data)

    def handle_comment(self, data):
        """
        Nest a comment onto the tree as a Comment node.
        """
        self.structure.mnest(data)