# coding: utf-8
# Name: ehp.py
# Author: Iury de oliveira gomes figueiredo and Mancuniancol
# Created on: 28.11.2016
# Licence: GPL v.3: http://www.gnu.org/copyleft/gpl.html
""""
All credit for this code goes to Iury de oliveira gomes figueiredo.
Easy Html Parser is an AST generator for html/xml documents. You can easily delete/insert/extract tags in html/xml
documents as well as look for patterns.
https://github.com/iogf/ehp
"""
from HTMLParser import HTMLParser
from collections import deque
# Version string of this module.
version = '1.3a'
# Node-type identifiers. Non-tag nodes store one of these integers in their
# ``name`` attribute instead of a tag name.
DATA = 1  # plain character data (used by Data)
META = 2  # NOTE(review): not referenced in the visible part of this file
COMMENT = 3  # NOTE(review): not referenced in the visible part of this file
PI = 4  # processing instruction (used by Pi)
CODE = 5  # numeric character reference (used by Code)
AMP = 6  # named entity reference (used by Amp)
class Attribute(dict):
    """
    Mapping that holds the attributes of a tag.

    It behaves like a dict, except that looking up a missing key (or a key
    whose value is None) gives an empty string instead of raising KeyError.

    Example:
    dom = Html().feed('<p style="color:green"> foo </p>')
    for ind in dom.sail():
        if ind.name == 'p':
            ind.attr['style'] = "color:blue"
    It would change the colour to blue.
    """

    def __getitem__(self, key):
        """
        Return the value stored under key, or "" when the key is absent or
        its value is None.
        """
        found = dict.get(self, key)
        if found is None:
            return ""
        return found

    def __str__(self):
        """
        Return the attributes rendered as html: every pair is written as
        key="value" followed by one space.
        """
        # NOTE(review): values are written verbatim (no quote escaping), and
        # a None value is rendered as the text "None".
        return ''.join('%s="%s" ' % (key, value) for key, value in self.items())
class Root(list):
    """
    Outermost node of an xml/html document and base class of every node.

    A node is a list of its child nodes and carries two attributes:
    ``name`` (the tag name, or a numeric node-type constant such as DATA for
    nodes that are not tags) and ``attr`` (an Attribute mapping).

    Example:
    html = Html()
    dom = html.feed('<html> ... </html>')
    dom.name == ''
    True
    type(dom) == Root
    True
    """

    def __init__(self, name=None, attr=None):
        """
        name is the tag name (or node-type constant) and attr is anything
        the Attribute constructor accepts; None means "no attributes".
        """
        list.__init__(self)
        self.name = name
        self.attr = Attribute({} if attr is None else attr)

    # Nodes are lists; use the plain object repr instead of dumping children.
    __repr__ = object.__repr__

    def __str__(self):
        """
        Return the concatenation of the string form of every child.
        """
        return ''.join('%s' % (child,) for child in self)

    def __call__(self, tag=None, order=1, select=None, attribute='text', divider=('', 1)):
        """
        Return the text, or one attribute, of a specific descendant.

        tag: name of the tag to look for; None means this node itself.
        order: 1-based occurrence of the tag to use.
        select: one (key, values) pair or a list of such pairs that the tag
        must match (same rules as Root.find).
        attribute: 'text' for the node text, otherwise an attribute name.
        divider: (separator, position). When separator is not '', the value
        is split on it and the piece at position is returned.

        The result is stripped of surrounding whitespace. An empty string is
        returned when nothing matches or the requested piece does not exist.

        Example:
        data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p>
        <p style="color:green"> gamma.</p> </body><a href="www.google.com">hello</a>'
        dom = Html().feed(data)
        dom(tag='p', select=('style', 'color:green'))            -> 'beta.'
        dom(tag='p', select=('style', 'color:green'), order=2)   -> 'gamma.'
        dom(tag='a', attribute='href')                           -> 'www.google.com'
        dom(tag='a', attribute='href', divider=('.', 1))         -> 'google'
        """
        node = self if tag is None else self._nth_match(tag, select, order)
        if node is None:
            return ''
        # Compare by value: identity of string literals is not guaranteed.
        if attribute == 'text':
            value = node.text()
        else:
            value = node.attr[attribute]
        value = '' if value is None else value.strip()
        if divider[0] != '':
            pieces = value.split(divider[0])
            if len(pieces) > divider[1]:
                return pieces[divider[1]].strip()
            return ''
        return value

    def __getitem__(self, item):
        """
        node['key'] returns the attribute 'key' ("" when it is missing);
        node[0] and node[1:3] index the children like a regular list.
        """
        if isinstance(item, (int, slice)):
            return list.__getitem__(self, item)
        return self.attr[item]

    def _nth_match(self, tag, select, order):
        """
        Return the order-th (1-based) node yielded by find(tag, ...) with the
        filters in select, or None when there are fewer matches.
        """
        if isinstance(select, tuple):
            select = [select]
        filters = () if select is None else select
        for position, node in enumerate(self.find(tag, 1, 1, *filters), 1):
            if position == order:
                return node
        return None

    def sail(self):
        """
        Iterate over every descendant of this node, children before their
        parent (post-order).

        Example:
        data = '<a> <b> </b> </a>'
        dom = Html().feed(data)
        for ind in dom.sail():
            print type(ind), ',', ind.name
        The data nodes are yielded as well as the tags b and a.
        """
        # Iterate over a copy so callers may remove nodes while sailing.
        for child in list(self):
            for descendant in child.sail():
                yield descendant
            yield child

    def index(self, item, **kwargs):
        """
        Like list.index but compares by identity instead of equality.

        Raises ValueError when item is not a direct child. Extra keyword
        arguments are accepted and ignored.

        Example:
        data = '<a><b></b><b></b></a>'
        dom = Html().feed(data)
        for root, ind in dom.sail_with_root():
            print root.name, ind.name, root.index(ind)
        It would print.
        a b 0
        a b 1
         a 0
        The last line corresponds to the outmost object, the Root instance
        that contains all the other objects.
        """
        for position, child in enumerate(self):
            if child is item:
                return position
        raise ValueError('%r is not a direct child of this node' % (item,))

    def remove(self, item):
        """
        Like list.remove but compares by identity.

        Example:
        data = '<a><b></b><b></b></a>'
        dom = Html().feed(data)
        for root, ind in dom.sail_with_root():
            if ind.name == 'b':
                root.remove(ind)
        print dom
        It should print.
        <a ></a>
        """
        del self[self.index(item)]

    def find(self, name='', every=1, start=1, *args):
        """
        Iterate over the descendants called name that satisfy every filter.

        Each filter in args is a (key, values) pair, where values is a single
        value or a list of values. A node passes a filter when one of the
        whitespace-separated words of its attribute key equals one of the
        values. Matches are counted from 1; the ones yielded are number
        start, start + every, start + 2 * every, ...

        Example 1:
        data = '<a><b></b><b></b></a>'
        dom = Html().feed(data)
        for ind in dom.find('b'):
            print ind
        It should print.
        <b ></b>
        <b ></b>
        Example 2:
        data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
        dom = Html().feed(data)
        for ind in dom.find('p', 1, 1, ('style', 'color:green')):
            print ind
        Or
        for ind in dom.find('p', 1, 1, ('style', ['color:green', 'color:red'])):
            print ind
        Output.
        <p style="color:green" > beta.</p>
        """
        hits = 0
        for node in self.sail():
            if node.name != name:
                continue
            if not all(self._has_word(node, key, values) for key, values in args):
                continue
            hits += 1
            if hits >= start and (hits - start) % every == 0:
                yield node

    @staticmethod
    def _has_word(node, key, values):
        """
        Tell whether one of the whitespace-separated words of node.attr[key]
        equals one of values (a single value or a list of values).
        """
        wanted = values if isinstance(values, list) else [values]
        return any(value == word
                   for value in wanted
                   for word in node.attr[key].split())

    def find_once(self, tag=None, select=None, order=1):
        """
        Return the order-th (1-based) occurrence of tag matching the filters
        in select (one (key, values) pair or a list of them).

        When tag is None or there is no such occurrence, an empty
        Tag('html') is returned instead of None.
        """
        found = None
        if tag is not None:
            found = self._nth_match(tag, select, order)
        return Tag('html') if found is None else found

    def find_all(self, tag=None, select=None, every=1, start=1):
        """
        Return the list of all occurrences of tag matching the filters in
        select; every and start have the same meaning as in Root.find.
        An empty list is returned when tag is None.
        """
        if tag is None:
            return []
        if isinstance(select, tuple):
            select = [select]
        filters = () if select is None else select
        return list(self.find(tag, every, start, *filters))

    def find_with_root(self, name, *args):
        """
        Like Root.find but yields (parent, node) pairs.

        Unlike Root.find, a filter (key, values) compares the whole attribute
        value with each of the values; the attribute is not split into words.

        Example:
        dom = Html().feed('''<body> <p> alpha </p> <p> beta </p> </body>''')
        for root, ind in dom.find_with_root('p'):
            root.remove(ind)
        print dom
        It would output.
        <body >   </body>
        """
        for root, node in self.sail_with_root():
            if node.name != name:
                continue
            for key, values in args:
                wanted = values if isinstance(values, list) else [values]
                if not any(node.attr[key] == value for value in wanted):
                    break
            else:
                yield (root, node)

    def by_id(self, id_value):
        """
        Shortcut that returns the first node whose attribute 'id' equals
        id_value, or None when there is none.

        Example:
        data = '<a><b id="foo"></b></a>'
        dom = Html().feed(data)
        print dom.by_id('foo')
        print dom.by_id('bar')
        It should print.
        <b id="foo" ></b>
        None
        """
        # take() expects (key, value) pairs, so the pair must be one argument.
        return self.take(('id', id_value))

    def take(self, *args):
        """
        Return the first node whose attributes match all the given
        (key, value) pairs, or None.

        Example:
        data = '<a><b id="foo" size="1"></b></a>'
        dom = Html().feed(data)
        print dom.take(('id', 'foo'))
        print dom.take(('id', 'foo'), ('size', '2'))
        """
        return next(self.match(*args), None)

    def take_with_root(self, *args):
        """
        Like Root.take but returns the pair (parent, node), or None.
        """
        return next(self.match_with_root(*args), None)

    def match(self, *args):
        """
        Iterate over the nodes whose attributes equal all the given
        (key, value) pairs.

        Example:
        data = '<a size="1"><b size="1"></b></a>'
        dom = Html().feed(data)
        for ind in dom.match(('size', '1')):
            print ind
        It would print.
        <b size="1" ></b>
        <a size="1" ><b size="1" ></b></a>
        """
        for node in self.sail():
            if all(node.attr[key] == value for key, value in args):
                yield node

    def match_with_root(self, *args):
        """
        Like Root.match but yields (parent, node) pairs.

        Example:
        dom = Html().feed('''<body> <p style="color:black"> xxx </p>
        <p style = "color:black"> mmm </p></body>''')
        for root, ind in dom.match_with_root(('style', 'color:black')):
            del ind.attr['style']
        item = dom.fst('body')
        item.attr['style'] = 'color:black'
        print dom
        Output.
        <body style="color:black" > <p > xxx </p>
        <p > mmm </p></body>
        """
        for root, node in self.sail_with_root():
            if all(node.attr[key] == value for key, value in args):
                yield (root, node)

    def join(self, delim, *args):
        """
        Return a string with every descendant whose name appears in args,
        each one preceded by delim.

        Example 1:
        data = '<a><b> This is cool. </b><b> That is. </b></a>'
        dom = Html().feed(data)
        print dom.join('', 'b')
        It would print.
        <b > This is cool. </b><b > That is. </b>
        Example 2:
        data = '<a><b>alpha</b><c>beta</c><b>gamma</a>'
        dom = Html().feed(data)
        print dom.join('\\n', DATA)
        It would print.

        alpha
        beta
        gamma
        """
        return ''.join('%s%s' % (delim, node)
                       for node in self.sail()
                       if node.name in args)

    def fst(self, name, *args):
        """
        Return the first node called name that satisfies the filters in args
        (same rules as Root.find), or None.

        Example 1:
        data = '<body> <em> Cool. </em></body>'
        dom = Html().feed(data)
        print dom.fst('em')
        It outputs.
        <em > Cool. </em>
        Example 2:
        data = '<body> <p> alpha. </p> <p style="color:green"> beta.</p> </body>'
        dom = Html().feed(data)
        print dom.fst('p', ('style', 'color:green'))
        Output:
        <p style="color:green" > beta.</p>
        """
        return next(self.find(name, 1, 1, *args), None)

    def fst_with_root(self, name, *args):
        """
        Like Root.fst but returns the pair (parent, node), or None. The
        filters follow the rules of Root.find_with_root.

        Example:
        data = '<body> <em> Cool. </em></body>'
        dom = Html().feed(data)
        root, item = dom.fst_with_root('em')
        root.insert_after(item, Tag('p'))
        print root
        It outputs.
        <body > <em > Cool. </em><p ></p></body>
        """
        return next(self.find_with_root(name, *args), None)

    def text(self):
        """
        Return the character data found inside this node, that is, the
        concatenation of all descendants whose name is DATA.

        Example:
        data = '<body><em>This is all the text.</em></body>'
        dom = Html().feed(data)
        print dom.fst('em').text()
        It outputs.
        This is all the text.
        """
        return self.join('', DATA)

    def write(self, filename):
        """
        Save the string form of the structure to the file filename.
        """
        # The context manager closes the file even when rendering fails.
        with open(filename, 'w') as fd:
            fd.write(str(self))

    def sail_with_root(self):
        """
        Like Root.sail but yields (parent, node) pairs.
        For an example, see help(Root.remove).
        """
        # Iterate over a copy so callers may remove nodes while sailing.
        for child in list(self):
            for pair in child.sail_with_root():
                yield pair
            yield (self, child)

    def walk(self):
        """
        Like Root.sail but yields (node, node.name, node.attr) triples.

        Example:
        data = '<body> <em> This is all the text.</em></body>'
        dom = Html().feed(data)
        for ind, name, attr in dom.walk():
            print 'TAG:', ind
            print 'NAME:', name
            print 'ATTR:', attr
        """
        for node in self.sail():
            yield (node, node.name, node.attr)

    def walk_with_root(self):
        """
        Like Root.walk but yields a pair of triples, the first one for the
        parent and the second one for the node.

        Example:
        data = '<body><em>alpha</em></body>'
        dom = Html().feed(data)
        for (root, rname, rattr), (ind, name, attr) in dom.walk_with_root():
            print root, rname, ind, name
        """
        for root, node in self.sail_with_root():
            yield ((root, root.name, root.attr),
                   (node, node.name, node.attr))

    def insert_after(self, y, k):
        """
        Insert k right after the child y (located by identity).
        For an example, see help(Root.fst_with_root).
        """
        self.insert(self.index(y) + 1, k)

    def insert_before(self, y, k):
        """
        Insert k right before the child y (located by identity).
        """
        self.insert(self.index(y), k)

    def parent(self, dom):
        """
        Return the parent of this node inside dom, or None.

        The node is first looked up by identity. When it is not part of dom,
        the parent of the first node with the same string form is returned.
        """
        pairs = list(dom.sail_with_root())
        for root, node in pairs:
            if node is self:
                return root
        # Fallback for a node that comes from another parse of the document.
        wanted = str(self)
        for root, node in pairs:
            if str(node) == wanted:
                return root
        return None

    def list_(self, text=""):
        """
        Return a list of (path, node) pairs for every descendant that is not
        a DATA node.

        The path is the space separated chain of ancestors starting at this
        node, each one written as name, then ".class" and "#id" when those
        attributes are set (spaces inside class are replaced by "." and
        spaces inside id by "#"). text is the path prefix used on recursion.
        """
        result = []
        for child in list(self):
            path = text + ' ' + str(child.name)
            class_name = child.attr["class"].replace(" ", ".")
            if class_name:
                path += "." + class_name
            id_name = child.attr["id"].replace(" ", "#")
            if id_name:
                path += "#" + id_name
            if child.name != DATA:
                result.append((path.strip(), child))
            result.extend(child.list_(path))
        return result

    def select(self, text=""):
        """
        Return the nodes whose path (see Root.list_) ends with text.
        """
        return [node for path, node in self.list_() if path.endswith(text)]

    def get_attributes(self, text):
        """
        Return the path (see Root.list_) of the first node whose string form
        equals text, or None when there is none. Spaces are ignored on both
        sides of the comparison and ';' is removed from text only.
        """
        text = text.replace(' ', '').replace(';', '')
        for path, node in self.list_():
            if text == str(node).replace(' ', ''):
                return path
        return None
class Tag(Root):
    """
    A regular xml/html tag with an opening and a closing part:
    <name key="value" ...> ... </name>.
    The children of the tag are the items of the instance.
    """

    def __init__(self, name, attr=None):
        """
        name is the tag's name and attr its attributes (None for no
        attributes).
        Example:
        d = {'style': 'background:blue;'}
        x = Tag('p', d)
        """
        Root.__init__(self, name, {} if attr is None else attr)

    def __str__(self):
        """
        Return the markup of the node: the opening tag with its attributes,
        the string form of every child and the closing tag.
        """
        pieces = ['<%s %s>' % (self.name, self.attr)]
        pieces.extend('%s' % (child,) for child in self)
        pieces.append('</%s>' % self.name)
        return ''.join(pieces)
class Data(Root):
    """
    Character data found inside an xml/html document.
    Everything that is not an xml/html token is stored in the structure as
    an instance of this class; its name is the constant DATA.
    Example:
    html = Html()
    data = '<body><em>alpha</em></body>'
    dom = html.feed(data)
    x = dom.fst('em')
    for child in x:
        print type(child)
        print child
    Output:
    <class 'ehp.Data'>
    alpha
    """

    def __init__(self, data):
        """
        data is the string of characters held by the node.
        Example:
        html = Html()
        data = '<body><em>alpha</em></body>'
        dom = html.feed(data)
        x = dom.fst('em')
        x.append(Data('\\nbeta'))
        print dom
        It outputs.
        <body ><em >alpha
        beta</em></body>
        """
        Root.__init__(self, DATA)
        self.data = data

    def __str__(self):
        """
        Return the characters held by the node, unchanged.
        """
        return self.data

    def text(self):
        """
        Return the characters held by the node, unchanged.
        """
        return self.data
class XTag(Root):
    """
    A tag written in XHTML self-closing style, like <img src="t.gif" />.
    Its string form never includes children.
    """

    def __init__(self, name, attr=None):
        """
        name is the tag's name and attr its attributes (None for no
        attributes). See help(Tag).
        """
        Root.__init__(self, name, {} if attr is None else attr)

    def __str__(self):
        """
        Return the self-closing markup of the tag.
        """
        return '<%s %s/>' % (self.name, self.attr)
class Code(Root):
    """
    A numeric character reference; its name is the constant CODE.
    """

    def __init__(self, data):
        """
        data is the text of the reference without the leading '&#'.
        """
        Root.__init__(self, CODE)
        self.data = data

    def __str__(self):
        """
        Return '&#' followed by data.
        """
        # NOTE(review): no terminating ';' is written here - confirm that
        # this is the intended output.
        return '&#%s' % self.data
class Amp(Root):
    """
    A named entity reference; its name is the constant AMP.
    """

    def __init__(self, data):
        """
        data is the entity name without the leading '&'.
        """
        Root.__init__(self, AMP)
        self.data = data

    def __str__(self):
        """
        Return '&' followed by data.
        """
        # NOTE(review): no terminating ';' is written here - confirm that
        # this is the intended output.
        return '&%s' % self.data
class Pi(Root):
    """
    A processing instruction; its name is the constant PI.
    """

    def __init__(self, data):
        """
        data is the text of the instruction without the leading '<?'.
        """
        Root.__init__(self, PI)
        self.data = data

    def __str__(self):
        """
        Return data wrapped as '<?data>'.
        """
        return '<?%s>' % self.data
class Tree(object):
    """
    The engine class. It builds the tree of nodes out of the events
    reported by the tokenizer.

    outmost is the Root node that holds the whole document and stack is the
    chain of the nodes which are currently open, the innermost one last.
    """

    def __init__(self):
        """
        Initialize outmost, the structure which will hold all the data of
        the document, and make it the current scope.
        """
        self.outmost = Root('')
        self.stack = deque()
        self.stack.append(self.outmost)

    def clear(self):
        """
        Drop the current structure and start a new, empty one.
        """
        self.outmost = Root('')
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self):
        """
        Return the node at the top of the stack, the current scope.
        """
        return self.stack[-1]

    def nest(self, name, attr):
        """
        Append a new Tag to the current scope and make the tag the new
        current scope.
        """
        item = Tag(name, attr)
        self.last().append(item)
        self.stack.append(item)

    def dnest(self, data):
        """
        Append a Data node to the current scope.
        """
        self.last().append(Data(data))

    def xnest(self, name, attr):
        """
        Append an XTag (self-closing tag) to the current scope.
        """
        self.last().append(XTag(name, attr))

    def ynest(self, data):
        """
        Append a Meta node to the current scope.
        """
        # NOTE(review): Meta is not defined in the visible part of this
        # module - confirm it exists, otherwise this raises NameError.
        self.last().append(Meta(data))

    def mnest(self, data):
        """
        Append a Comment node to the current scope.
        """
        # NOTE(review): Comment is not defined in the visible part of this
        # module - confirm it exists, otherwise this raises NameError.
        self.last().append(Comment(data))

    def cnest(self, data):
        """
        Append a Code node (numeric character reference) to the current scope.
        """
        self.last().append(Code(data))

    def rnest(self, data):
        """
        Append an Amp node (named entity reference) to the current scope.
        """
        self.last().append(Amp(data))

    def inest(self, data):
        """
        Append a Pi node (processing instruction) to the current scope.
        """
        self.last().append(Pi(data))

    def enclose(self, name):
        """
        Handle a closing tag: pop the scopes from the stack up to and
        including the innermost open node called name, so that its parent
        becomes the current scope. The stack is left untouched when no open
        node has that name.
        """
        for depth, node in enumerate(reversed(self.stack), 1):
            if node.name == name:
                break
        else:
            # Stray closing tag: nothing to close.
            return
        # Pop the matching node and the unclosed nodes nested inside it.
        for _ in range(depth):
            self.stack.pop()
class Html(HTMLParser):
    """
    The tokenizer class. It forwards every event of HTMLParser to a Tree,
    which builds the structure of the document.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.structure = Tree()

    def fromfile(self, filename):
        """
        Build a structure from the contents of the file filename and
        return its outmost node.
        """
        # The context manager closes the file even when reading fails.
        with open(filename, 'r') as fd:
            data = fd.read()
        return self.feed(data)

    def feed(self, data):
        """
        Parse data into a new structure and return its outmost node. The
        structure of a previous call is discarded first.
        """
        self.structure.clear()
        HTMLParser.feed(self, data)
        return self.structure.outmost

    def handle_starttag(self, name, attr):
        """
        When an opening tag is found, nest it onto the tree.
        """
        self.structure.nest(name, attr)

    def handle_startendtag(self, name, attr):
        """
        When a tag in XHTML style is found, nest it onto the tree.
        """
        self.structure.xnest(name, attr)

    def handle_endtag(self, name):
        """
        When a closing tag is found, make the tree point to the right scope.
        """
        self.structure.enclose(name)

    def handle_data(self, data):
        """
        Nest data onto the tree.
        """
        self.structure.dnest(data)

    def handle_decl(self, decl):
        """
        Nest a declaration onto the tree (see Tree.ynest).
        """
        self.structure.ynest(decl)

    def unknown_decl(self, decl):
        """
        Nest an unknown declaration onto the tree (see Tree.ynest).
        """
        self.structure.ynest(decl)

    def handle_charref(self, data):
        """
        Nest a numeric character reference onto the tree.
        """
        self.structure.cnest(data)

    def handle_entityref(self, data):
        """
        Nest a named entity reference onto the tree.
        """
        self.structure.rnest(data)

    def handle_pi(self, data):
        """
        Nest a processing instruction onto the tree.
        """
        self.structure.inest(data)