0 | 0 |
#!/usr/bin/env python
|
1 | 1 |
|
2 | |
# Python bindings to the Google search engine
|
3 | |
# Copyright (c) 2009-2019, Mario Vilas
|
|
2 |
# Copyright (c) 2009-2020, Mario Vilas
|
4 | 3 |
# All rights reserved.
|
5 | 4 |
#
|
6 | 5 |
# Redistribution and use in source and binary forms, with or without
|
|
31 | 30 |
import random
|
32 | 31 |
import sys
|
33 | 32 |
import time
|
|
33 |
import ssl
|
34 | 34 |
|
35 | 35 |
if sys.version_info[0] > 2:
|
36 | 36 |
from http.cookiejar import LWPCookieJar
|
|
54 | 54 |
# Main search function.
|
55 | 55 |
'search',
|
56 | 56 |
|
57 | |
# Specialized search functions.
|
58 | |
'search_images', 'search_news',
|
59 | |
'search_videos', 'search_shop',
|
60 | |
'search_books', 'search_apps',
|
61 | |
|
62 | 57 |
# Shortcut for "get lucky" search.
|
63 | 58 |
'lucky',
|
64 | 59 |
|
|
69 | 64 |
# URL templates to make Google searches.
|
70 | 65 |
url_home = "https://www.google.%(tld)s/"
|
71 | 66 |
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
72 | |
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
|
|
67 |
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
|
73 | 68 |
"cr=%(country)s"
|
74 | 69 |
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
75 | |
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
|
|
70 |
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
|
76 | 71 |
"cr=%(country)s"
|
77 | 72 |
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
78 | 73 |
"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
|
79 | |
"tbm=%(tpe)s&cr=%(country)s"
|
|
74 |
"cr=%(country)s"
|
80 | 75 |
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
|
81 | 76 |
"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
|
82 | |
"safe=%(safe)s&tbm=%(tpe)s&cr=%(country)s"
|
|
77 |
"safe=%(safe)s&cr=%(country)s"
|
83 | 78 |
url_parameters = (
|
84 | |
'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'tbm', 'cr')
|
|
79 |
'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
|
85 | 80 |
|
86 | 81 |
# Cookie jar. Stored at the user's home folder.
|
87 | 82 |
# If the cookie jar is inaccessible, the errors are ignored.
|
|
152 | 147 |
|
153 | 148 |
# Request the given URL and return the response page, using the cookie jar.
|
154 | 149 |
# If the cookie jar is inaccessible, the errors are ignored.
|
155 | |
def get_page(url, user_agent=None):
|
|
150 |
def get_page(url, user_agent=None, verify_ssl=True):
|
156 | 151 |
"""
|
157 | 152 |
Request the given URL and return the response page, using the cookie jar.
|
158 | 153 |
|
159 | 154 |
:param str url: URL to retrieve.
|
160 | 155 |
:param str user_agent: User agent for the HTTP requests.
|
161 | 156 |
Use None for the default.
|
|
157 |
:param bool verify_ssl: Verify the SSL certificate to prevent
|
|
158 |
traffic interception attacks. Defaults to True.
|
162 | 159 |
|
163 | 160 |
:rtype: str
|
164 | 161 |
:return: Web page retrieved for the given URL.
|
|
172 | 169 |
request = Request(url)
|
173 | 170 |
request.add_header('User-Agent', user_agent)
|
174 | 171 |
cookie_jar.add_cookie_header(request)
|
175 | |
response = urlopen(request)
|
|
172 |
if verify_ssl:
|
|
173 |
response = urlopen(request)
|
|
174 |
else:
|
|
175 |
context = ssl._create_unverified_context()
|
|
176 |
response = urlopen(request, context=context)
|
176 | 177 |
cookie_jar.extract_cookies(response, request)
|
177 | 178 |
html = response.read()
|
178 | 179 |
response.close()
|
|
207 | 208 |
|
208 | 209 |
# Returns a generator that yields URLs.
|
209 | 210 |
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
|
210 | |
stop=None, domains=None, pause=2.0, tpe='', country='',
|
211 | |
extra_params=None, user_agent=None):
|
|
211 |
stop=None, pause=2.0, country='', extra_params=None,
|
|
212 |
user_agent=None, verify_ssl=True):
|
212 | 213 |
"""
|
213 | 214 |
Search the given query string using Google.
|
214 | 215 |
|
|
222 | 223 |
:param int start: First result to retrieve.
|
223 | 224 |
:param int stop: Last result to retrieve.
|
224 | 225 |
Use None to keep searching forever.
|
225 | |
:param list domains: A list of web domains to constrain
|
226 | |
the search.
|
227 | 226 |
:param float pause: Lapse to wait between HTTP requests.
|
228 | 227 |
A lapse too long will make the search slow, but a lapse too short may
|
229 | 228 |
cause Google to block your IP. Your mileage may vary!
|
230 | |
:param str tpe: Search type (images, videos, news, shopping, books, apps)
|
231 | |
Use the following values {videos: 'vid', images: 'isch',
|
232 | |
news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
|
233 | 229 |
:param str country: Country or region to focus the search on. Similar to
|
234 | 230 |
changing the TLD, but does not yield exactly the same results.
|
235 | 231 |
Only Google knows why...
|
|
239 | 235 |
{'filter': '0'} which will append '&filter=0' to every query.
|
240 | 236 |
:param str user_agent: User agent for the HTTP requests.
|
241 | 237 |
Use None for the default.
|
|
238 |
:param bool verify_ssl: Verify the SSL certificate to prevent
|
|
239 |
traffic interception attacks. Defaults to True.
|
242 | 240 |
|
243 | 241 |
:rtype: generator of str
|
244 | 242 |
:return: Generator (iterator) that yields found URLs.
|
|
250 | 248 |
|
251 | 249 |
# Count the number of links yielded.
|
252 | 250 |
count = 0
|
253 | |
|
254 | |
# Prepare domain list if it exists.
|
255 | |
if domains:
|
256 | |
query = query + ' ' + ' OR '.join(
|
257 | |
'site:' + domain for domain in domains)
|
258 | 251 |
|
259 | 252 |
# Prepare the search string.
|
260 | 253 |
query = quote_plus(query)
|
|
275 | 268 |
)
|
276 | 269 |
|
277 | 270 |
# Grab the cookie from the home page.
|
278 | |
get_page(url_home % vars(), user_agent)
|
|
271 |
get_page(url_home % vars(), user_agent, verify_ssl)
|
279 | 272 |
|
280 | 273 |
# Prepare the URL of the first request.
|
281 | 274 |
if start:
|
|
308 | 301 |
time.sleep(pause)
|
309 | 302 |
|
310 | 303 |
# Request the Google Search results page.
|
311 | |
html = get_page(url, user_agent)
|
|
304 |
html = get_page(url, user_agent, verify_ssl)
|
312 | 305 |
|
313 | 306 |
# Parse the response and get every anchored URL.
|
314 | 307 |
if is_bs4:
|
|
368 | 361 |
url = url_next_page_num % vars()
|
369 | 362 |
|
370 | 363 |
|
371 | |
# Shortcut to search images.
|
372 | |
# Beware, this does not return the image link.
|
373 | |
def search_images(*args, **kwargs):
|
374 | |
"""
|
375 | |
Shortcut to search images.
|
376 | |
|
377 | |
Same arguments and return value as the main search function.
|
378 | |
|
379 | |
:note: Beware, this does not return the image link.
|
380 | |
"""
|
381 | |
kwargs['tpe'] = 'isch'
|
382 | |
return search(*args, **kwargs)
|
383 | |
|
384 | |
|
385 | |
# Shortcut to search news.
|
386 | |
def search_news(*args, **kwargs):
|
387 | |
"""
|
388 | |
Shortcut to search news.
|
389 | |
|
390 | |
Same arguments and return value as the main search function.
|
391 | |
"""
|
392 | |
kwargs['tpe'] = 'nws'
|
393 | |
return search(*args, **kwargs)
|
394 | |
|
395 | |
|
396 | |
# Shortcut to search videos.
|
397 | |
def search_videos(*args, **kwargs):
|
398 | |
"""
|
399 | |
Shortcut to search videos.
|
400 | |
|
401 | |
Same arguments and return value as the main search function.
|
402 | |
"""
|
403 | |
kwargs['tpe'] = 'vid'
|
404 | |
return search(*args, **kwargs)
|
405 | |
|
406 | |
|
407 | |
# Shortcut to search shop.
|
408 | |
def search_shop(*args, **kwargs):
|
409 | |
"""
|
410 | |
Shortcut to search shop.
|
411 | |
|
412 | |
Same arguments and return value as the main search function.
|
413 | |
"""
|
414 | |
kwargs['tpe'] = 'shop'
|
415 | |
return search(*args, **kwargs)
|
416 | |
|
417 | |
|
418 | |
# Shortcut to search books.
|
419 | |
def search_books(*args, **kwargs):
|
420 | |
"""
|
421 | |
Shortcut to search books.
|
422 | |
|
423 | |
Same arguments and return value as the main search function.
|
424 | |
"""
|
425 | |
kwargs['tpe'] = 'bks'
|
426 | |
return search(*args, **kwargs)
|
427 | |
|
428 | |
|
429 | |
# Shortcut to search apps.
|
430 | |
def search_apps(*args, **kwargs):
|
431 | |
"""
|
432 | |
Shortcut to search apps.
|
433 | |
|
434 | |
Same arguments and return value as the main search function.
|
435 | |
"""
|
436 | |
kwargs['tpe'] = 'app'
|
437 | |
return search(*args, **kwargs)
|
438 | |
|
439 | |
|
440 | 364 |
# Shortcut to single-item search.
|
441 | 365 |
# Evaluates the iterator to return the single URL as a string.
|
442 | 366 |
def lucky(*args, **kwargs):
|