# Copyright (C) 2009 Jorge J. Lopez C. <lowlifebob[at]gmail[point]com>
# weblog : http://www.lopz.org
import httplib
import re
import urllib
from sgmllib import SGMLParser
# Host the search request is sent to; passed as `dom` to get_links() at the
# bottom of the file.
const_dom = "www.google.com.bo"
# Request path template. download() fills the two %s placeholders with the
# query string and the number of results, in that order.
# NOTE(review): hl=es and safe=off are fixed here; get_links() strips a link
# prefix that contains the same two values, so presumably they must stay in
# sync -- confirm before changing either.
const_url = "/search?as_q=%s&hl=es&num=%s&safe=off"
class atag(SGMLParser):
    """SGML parser that records the ``href`` value of every ``<a>`` tag.

    After feeding HTML to an instance, the collected values are available
    in ``self.urls`` in document order.
    """

    def reset(self):
        """Reset the base parser state and start with an empty URL list."""
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        ``attrs`` is a list of ``(name, value)`` pairs; the value of every
        attribute named ``href`` is appended to ``self.urls``.
        """
        for attr_name, attr_value in attrs:
            if attr_name == 'href':
                self.urls.append(attr_value)
def download(dom, url, query, num):
    """Send the search request and return the HTTP response.

    Args:
        dom: Host name to connect to (plain HTTP, port 80).
        url: Request path template with two ``%s`` placeholders; they are
            filled with ``query`` and ``num``, in that order.
        query: Search terms. They are inserted into the path verbatim, so
            they must already be safe to use inside a URL.
        num: Number of results to request.

    Returns:
        The response object returned by ``HTTPConnection.getresponse()``.
        The body has not been read yet; the caller is expected to read it.
    """
    # Browser-like request headers. Long values are built from adjacent
    # string literals: a backslash continuation inside a literal keeps the
    # whitespace around the line break, which previously produced the
    # malformed media type "application/ xml" in the Accept header.
    headers = {
        "User-Agent": ("Mozilla/5.0 (X11; U; Linux i686; es-ES; "
                       "rv:1.9.0.3) Gecko/2008092416 Firefox/3.0.3"),
        "Accept": ("text/html,application/xhtml+xml,application/"
                   "xml;q=0.9,*/*;q=0.8"),
        "Accept-Language": "es-es,es;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "UTF-8,*",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
    }
    # Connect to the server.
    conn = httplib.HTTPConnection(dom, 80)
    # Request the search results page.
    conn.request("GET", url % (query, num), None, headers)
    return conn.getresponse()
def get_links(dom, url, query, num):
    """Return the sites referenced by the "related:" links of a results page.

    The results page is downloaded with download(), every ``<a href>`` is
    extracted with the atag parser, and each href containing ``related:``
    is turned into a URL by replacing the search-link prefix with
    ``http://``.

    Args:
        dom: Host name the request is sent to.
        url: Request path template with two ``%s`` placeholders (query and
            number of results).
        query: Search terms as typed by the user (not yet URL-encoded).
        num: Number of results to request.

    Returns:
        A list of URL strings, in the order the links appear in the page.
    """
    # URL-encode the search terms. quote_plus() turns spaces into '+' (as
    # the plain replace did before) and also escapes characters such as
    # '&', '#', '+' and '%' that would otherwise corrupt the request URL.
    if isinstance(query, unicode):
        # quote_plus() only handles byte strings reliably in Python 2.
        # NOTE(review): UTF-8 is assumed to be what the server expects.
        query = query.encode('utf-8')
    query = urllib.quote_plus(query)

    # Download the results page.
    resp = download(dom, url, query, num)
    html = resp.read()

    # Extract every link in the page.
    parser = atag()
    parser.feed(html)
    parser.close()

    # Keep only the "related:" links and rewrite them as plain URLs.
    related_prefix = "/search?hl=es&safe=off&q=related:"
    return [href.replace(related_prefix, "http://")
            for href in parser.urls
            if "related:" in href]
query = raw_input('Palabra a buscar :')
#num = raw_input('Numero de resultados: ')
links = get_links(const_dom, const_url, query, num=10)
for link in links:
print link