0 | 0 |
#!/usr/bin/env python
|
1 | 1 |
|
2 | |
# Python bindings to the Google search engine
|
3 | |
# Copyright (c) 2009-2018, Mario Vilas
|
|
2 |
# Copyright (c) 2009-2020, Mario Vilas
|
4 | 3 |
# All rights reserved.
|
5 | 4 |
#
|
6 | 5 |
# Redistribution and use in source and binary forms, with or without
|
|
31 | 30 |
import random
|
32 | 31 |
import sys
|
33 | 32 |
import time
|
34 | |
import math
|
|
33 |
import ssl
|
35 | 34 |
|
36 | 35 |
if sys.version_info[0] > 2:
|
37 | 36 |
from http.cookiejar import LWPCookieJar
|
|
55 | 54 |
# Main search function.
|
56 | 55 |
'search',
|
57 | 56 |
|
58 | |
# Specialized search functions.
|
59 | |
'search_images', 'search_news',
|
60 | |
'search_videos', 'search_shop',
|
61 | |
'search_books', 'search_apps',
|
62 | |
|
63 | 57 |
# Shortcut for "get lucky" search.
|
64 | 58 |
'lucky',
|
65 | 59 |
|
66 | |
# Computations based on the number of Google hits.
|
67 | |
'hits', 'ngd',
|
68 | |
|
69 | 60 |
# Miscellaneous utility functions.
|
70 | |
'get_random_user_agent',
|
|
61 |
'get_random_user_agent', 'get_tbs',
|
71 | 62 |
]
|
72 | 63 |
|
73 | 64 |
# URL templates to make Google searches.
|
74 | 65 |
url_home = "https://www.google.%(tld)s/"
|
75 | 66 |
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
76 | |
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
|
|
67 |
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
|
|
68 |
"cr=%(country)s"
|
77 | 69 |
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
78 | |
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
|
|
70 |
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
|
|
71 |
"cr=%(country)s"
|
79 | 72 |
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
|
80 | 73 |
"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
|
81 | |
"tbm=%(tpe)s"
|
|
74 |
"cr=%(country)s"
|
82 | 75 |
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
|
83 | 76 |
"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
|
84 | |
"safe=%(safe)s&tbm=%(tpe)s"
|
|
77 |
"safe=%(safe)s&cr=%(country)s"
|
|
78 |
url_parameters = (
|
|
79 |
'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
|
85 | 80 |
|
86 | 81 |
# Cookie jar. Stored at the user's home folder.
|
|
82 |
# If the cookie jar is inaccessible, the errors are ignored.
|
87 | 83 |
home_folder = os.getenv('HOME')
|
88 | 84 |
if not home_folder:
|
89 | 85 |
home_folder = os.getenv('USERHOME')
|
|
99 | 95 |
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
|
100 | 96 |
|
101 | 97 |
# Load the list of valid user agents from the install folder.
|
|
98 |
# The search order is:
|
|
99 |
# * user_agents.txt.gz
|
|
100 |
# * user_agents.txt
|
|
101 |
# * default user agent
|
102 | 102 |
try:
|
103 | 103 |
install_folder = os.path.abspath(os.path.split(__file__)[0])
|
104 | 104 |
try:
|
|
129 | 129 |
return random.choice(user_agents_list)
|
130 | 130 |
|
131 | 131 |
|
|
132 |
# Helper function to format the tbs parameter.
|
|
133 |
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    Both dates are rendered as MM/DD/YYYY and embedded in a custom date
    range expression of the form ``cdr:1,cd_min:<from>,cd_max:<to>``.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    # Format explicitly instead of rebinding the parameters and relying on
    # vars(): the output no longer depends on the names of the locals.
    date_format = '%m/%d/%Y'
    return 'cdr:1,cd_min:%s,cd_max:%s' % (
        from_date.strftime(date_format),
        to_date.strftime(date_format),
    )
|
|
146 |
|
|
147 |
|
132 | 148 |
# Request the given URL and return the response page, using the cookie jar.
|
133 | |
def get_page(url, user_agent=None):
|
|
149 |
# If the cookie jar is inaccessible, the errors are ignored.
|
|
150 |
def get_page(url, user_agent=None, verify_ssl=True):
|
134 | 151 |
"""
|
135 | 152 |
Request the given URL and return the response page, using the cookie jar.
|
136 | 153 |
|
137 | 154 |
:param str url: URL to retrieve.
|
138 | 155 |
:param str user_agent: User agent for the HTTP requests.
|
139 | 156 |
Use None for the default.
|
|
157 |
:param bool verify_ssl: Verify the SSL certificate to prevent
|
|
158 |
traffic interception attacks. Defaults to True.
|
140 | 159 |
|
141 | 160 |
:rtype: str
|
142 | 161 |
:return: Web page retrieved for the given URL.
|
|
150 | 169 |
request = Request(url)
|
151 | 170 |
request.add_header('User-Agent', user_agent)
|
152 | 171 |
cookie_jar.add_cookie_header(request)
|
153 | |
response = urlopen(request)
|
|
172 |
if verify_ssl:
|
|
173 |
response = urlopen(request)
|
|
174 |
else:
|
|
175 |
context = ssl._create_unverified_context()
|
|
176 |
response = urlopen(request, context=context)
|
154 | 177 |
cookie_jar.extract_cookies(response, request)
|
155 | 178 |
html = response.read()
|
156 | 179 |
response.close()
|
|
166 | 189 |
def filter_result(link):
    """
    Filter a link found in a results page.

    Links of the form ``/url?q=<target>&...`` are first decoded to their
    target. A link is accepted only when it is an absolute URL whose network
    location does not contain the substring ``google``.

    :param str link: Raw link taken from an anchor tag.

    :rtype: str or None
    :return: The (possibly decoded) link if it is a valid result,
        None otherwise or on malformed input.
    """
    try:
        # Decode hidden URLs.
        if link.startswith('/url?'):
            targets = parse_qs(urlparse(link, 'http').query).get('q')
            if not targets:
                # Redirect link without a target: nothing to return.
                return None
            link = targets[0]

        # Valid results are absolute URLs not pointing to a Google domain,
        # like images.google.com or googleusercontent.com for example.
        # TODO this could be improved!
        netloc = urlparse(link, 'http').netloc

    # Malformed input (not a string, unparsable URL) is not a result.
    # Only these expected failures are silenced; anything else propagates.
    except (AttributeError, TypeError, ValueError):
        return None

    if netloc and 'google' not in netloc:
        return link
    return None
|
189 | 207 |
|
190 | 208 |
|
191 | 209 |
# Returns a generator that yields URLs.
|
192 | 210 |
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
|
193 | |
stop=None, domains=None, pause=2.0, only_standard=False,
|
194 | |
extra_params={}, tpe='', user_agent=None):
|
|
211 |
stop=None, pause=2.0, country='', extra_params=None,
|
|
212 |
user_agent=None, verify_ssl=True):
|
195 | 213 |
"""
|
196 | 214 |
Search the given query string using Google.
|
197 | 215 |
|
|
203 | 221 |
:param str safe: Safe search.
|
204 | 222 |
:param int num: Number of results per page.
|
205 | 223 |
:param int start: First result to retrieve.
|
206 | |
:param int or None stop: Last result to retrieve.
|
|
224 |
:param int stop: Last result to retrieve.
|
207 | 225 |
Use None to keep searching forever.
|
208 | |
:param list of str or None domains: A list of web domains to constrain
|
209 | |
the search.
|
210 | 226 |
:param float pause: Lapse to wait between HTTP requests.
|
211 | 227 |
A lapse too long will make the search slow, but a lapse too short may
|
212 | 228 |
cause Google to block your IP. Your mileage may vary!
|
213 | |
:param bool only_standard: If True, only returns the standard results from
|
214 | |
each page. If False, it returns every possible link from each page,
|
215 | |
except for those that point back to Google itself. Defaults to False
|
216 | |
for backwards compatibility with older versions of this module.
|
217 | |
:param dict of str to str extra_params: A dictionary of extra HTTP GET
|
|
229 |
:param str country: Country or region to focus the search on. Similar to
|
|
230 |
changing the TLD, but does not yield exactly the same results.
|
|
231 |
Only Google knows why...
|
|
232 |
:param dict extra_params: A dictionary of extra HTTP GET
|
218 | 233 |
parameters, which must be URL encoded. For example if you don't want
|
219 | 234 |
Google to filter similar results you can set the extra_params to
|
220 | 235 |
{'filter': '0'} which will append '&filter=0' to every query.
|
221 | |
:param str tpe: Search type (images, videos, news, shopping, books, apps)
|
222 | |
Use the following values {videos: 'vid', images: 'isch',
|
223 | |
news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
|
224 | |
:param str or None user_agent: User agent for the HTTP requests.
|
|
236 |
:param str user_agent: User agent for the HTTP requests.
|
225 | 237 |
Use None for the default.
|
|
238 |
:param bool verify_ssl: Verify the SSL certificate to prevent
|
|
239 |
traffic interception attacks. Defaults to True.
|
226 | 240 |
|
227 | 241 |
:rtype: generator of str
|
228 | 242 |
:return: Generator (iterator) that yields found URLs.
|
|
232 | 246 |
# This is used to avoid repeated results.
|
233 | 247 |
hashes = set()
|
234 | 248 |
|
235 | |
# Count the number of links yielded
|
|
249 |
# Count the number of links yielded.
|
236 | 250 |
count = 0
|
237 | |
|
238 | |
# Prepare domain list if it exists.
|
239 | |
if domains:
|
240 | |
query = query + ' ' + ' OR '.join(
|
241 | |
'site:' + domain for domain in domains)
|
242 | 251 |
|
243 | 252 |
# Prepare the search string.
|
244 | 253 |
query = quote_plus(query)
|
245 | 254 |
|
246 | |
# Check extra_params for overlapping
|
247 | |
for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
|
|
255 |
# If no extra_params is given, create an empty dictionary.
|
|
256 |
# We should avoid using an empty dictionary as a default value
|
|
257 |
# in a function parameter in Python.
|
|
258 |
if not extra_params:
|
|
259 |
extra_params = {}
|
|
260 |
|
|
261 |
# Check extra_params for overlapping.
|
|
262 |
for builtin_param in url_parameters:
|
248 | 263 |
if builtin_param in extra_params.keys():
|
249 | 264 |
raise ValueError(
|
250 | 265 |
'GET parameter "%s" is overlapping with \
|
|
253 | 268 |
)
|
254 | 269 |
|
255 | 270 |
# Grab the cookie from the home page.
|
256 | |
get_page(url_home % vars(), user_agent)
|
|
271 |
get_page(url_home % vars(), user_agent, verify_ssl)
|
257 | 272 |
|
258 | 273 |
# Prepare the URL of the first request.
|
259 | 274 |
if start:
|
|
269 | 284 |
|
270 | 285 |
# Loop until we reach the maximum result, if any (otherwise, loop forever).
|
271 | 286 |
while not stop or count < stop:
|
272 | |
# Remember last count to detect the end of results
|
|
287 |
|
|
288 |
# Remember last count to detect the end of results.
|
273 | 289 |
last_count = count
|
274 | 290 |
|
275 | |
try: # Is it python<3?
|
276 | |
iter_extra_params = extra_params.iteritems()
|
277 | |
except AttributeError: # Or python>3?
|
278 | |
iter_extra_params = extra_params.items()
|
279 | |
# Append extra GET_parameters to URL
|
280 | |
for k, v in iter_extra_params:
|
281 | |
url += url + ('&%s=%s' % (k, v))
|
|
291 |
# Append extra GET parameters to the URL.
|
|
292 |
# This is done on every iteration because we're
|
|
293 |
# rebuilding the entire URL at the end of this loop.
|
|
294 |
for k, v in extra_params.items():
|
|
295 |
k = quote_plus(k)
|
|
296 |
v = quote_plus(v)
|
|
297 |
url = url + ('&%s=%s' % (k, v))
|
282 | 298 |
|
283 | 299 |
# Sleep between requests.
|
|
300 |
# Keeps Google from banning you for making too many requests.
|
284 | 301 |
time.sleep(pause)
|
285 | 302 |
|
286 | 303 |
# Request the Google Search results page.
|
287 | |
html = get_page(url, user_agent)
|
288 | |
|
289 | |
# Parse the response and process every anchored URL.
|
|
304 |
html = get_page(url, user_agent, verify_ssl)
|
|
305 |
|
|
306 |
# Parse the response and get every anchored URL.
|
290 | 307 |
if is_bs4:
|
291 | 308 |
soup = BeautifulSoup(html, 'html.parser')
|
292 | 309 |
else:
|
|
294 | 311 |
try:
|
295 | 312 |
anchors = soup.find(id='search').findAll('a')
|
296 | 313 |
# Sometimes (depending on the User-agent) there is
|
297 | |
# no id "search" in html response
|
|
314 |
# no id "search" in html response...
|
298 | 315 |
except AttributeError:
|
299 | |
# Remove links of the top bar
|
|
316 |
# Remove links of the top bar.
|
300 | 317 |
gbar = soup.find(id='gbar')
|
301 | 318 |
if gbar:
|
302 | 319 |
gbar.clear()
|
303 | 320 |
anchors = soup.findAll('a')
|
|
321 |
|
|
322 |
# Process every anchored URL.
|
304 | 323 |
for a in anchors:
|
305 | |
|
306 | |
# Leave only the "standard" results if requested.
|
307 | |
# Otherwise grab all possible links.
|
308 | |
if only_standard and (
|
309 | |
not a.parent or a.parent.name.lower() != "h3"):
|
310 | |
continue
|
311 | 324 |
|
312 | 325 |
# Get the URL from the anchor tag.
|
313 | 326 |
try:
|
|
329 | 342 |
# Yield the result.
|
330 | 343 |
yield link
|
331 | 344 |
|
|
345 |
# Increase the results counter.
|
|
346 |
# If we reached the limit, stop.
|
332 | 347 |
count += 1
|
333 | 348 |
if stop and count >= stop:
|
334 | 349 |
return
|
335 | 350 |
|
336 | 351 |
# End if there are no more results.
|
|
352 |
# XXX TODO review this logic, not sure if this is still true!
|
337 | 353 |
if last_count == count:
|
338 | 354 |
break
|
339 | 355 |
|
|
345 | 361 |
url = url_next_page_num % vars()
|
346 | 362 |
|
347 | 363 |
|
348 | |
# Shortcut to search images.
|
349 | |
# Beware, this does not return the image link.
|
350 | |
def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
                  start=0, stop=None, pause=2.0, domains=None,
                  only_standard=False, extra_params=None):
    """
    Shortcut to search images.

    Calls the main search function with the search type fixed to 'isch'.

    :note: Beware, this does not return the image link.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int or None stop: Last result to retrieve.
        Use None to keep searching forever.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param bool only_standard: If True, only returns the standard results from
        each page. If False, it returns every possible link from each page,
        except for those that point back to Google itself. Defaults to False
        for backwards compatibility with older versions of this module.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query. Use None for no extra parameters.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # A dict literal as default would be shared between calls; the main
    # search function expects a dictionary, so None is replaced here.
    if extra_params is None:
        extra_params = {}
    return search(query, tld, lang, tbs, safe, num, start, stop, domains,
                  pause, only_standard, extra_params, tpe='isch')
|
393 | |
|
394 | |
|
395 | |
# Shortcut to search news.
|
396 | |
def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
                start=0, stop=None, domains=None, pause=2.0,
                only_standard=False, extra_params=None):
    """
    Shortcut to search news.

    Calls the main search function with the search type fixed to 'nws'.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int or None stop: Last result to retrieve.
        Use None to keep searching forever.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param bool only_standard: If True, only returns the standard results from
        each page. If False, it returns every possible link from each page,
        except for those that point back to Google itself. Defaults to False
        for backwards compatibility with older versions of this module.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query. Use None for no extra parameters.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # A dict literal as default would be shared between calls; the main
    # search function expects a dictionary, so None is replaced here.
    if extra_params is None:
        extra_params = {}
    return search(query, tld, lang, tbs, safe, num, start, stop, domains,
                  pause, only_standard, extra_params, tpe='nws')
|
437 | |
|
438 | |
|
439 | |
# Shortcut to search videos.
|
440 | |
def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
                  start=0, stop=None, domains=None, pause=2.0,
                  only_standard=False, extra_params=None):
    """
    Shortcut to search videos.

    Calls the main search function with the search type fixed to 'vid'.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int or None stop: Last result to retrieve.
        Use None to keep searching forever.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param bool only_standard: If True, only returns the standard results from
        each page. If False, it returns every possible link from each page,
        except for those that point back to Google itself. Defaults to False
        for backwards compatibility with older versions of this module.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query. Use None for no extra parameters.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # A dict literal as default would be shared between calls; the main
    # search function expects a dictionary, so None is replaced here.
    if extra_params is None:
        extra_params = {}
    return search(query, tld, lang, tbs, safe, num, start, stop, domains,
                  pause, only_standard, extra_params, tpe='vid')
|
481 | |
|
482 | |
|
483 | |
# Shortcut to search shop.
|
484 | |
def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
                start=0, stop=None, domains=None, pause=2.0,
                only_standard=False, extra_params=None):
    """
    Shortcut to search shop.

    Calls the main search function with the search type fixed to 'shop'.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int or None stop: Last result to retrieve.
        Use None to keep searching forever.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param bool only_standard: If True, only returns the standard results from
        each page. If False, it returns every possible link from each page,
        except for those that point back to Google itself. Defaults to False
        for backwards compatibility with older versions of this module.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query. Use None for no extra parameters.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # A dict literal as default would be shared between calls; the main
    # search function expects a dictionary, so None is replaced here.
    if extra_params is None:
        extra_params = {}
    return search(query, tld, lang, tbs, safe, num, start, stop, domains,
                  pause, only_standard, extra_params, tpe='shop')
|
525 | |
|
526 | |
|
527 | |
# Shortcut to search books.
|
528 | |
def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
                 start=0, stop=None, domains=None, pause=2.0,
                 only_standard=False, extra_params=None):
    """
    Shortcut to search books.

    Calls the main search function with the search type fixed to 'bks'.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int or None stop: Last result to retrieve.
        Use None to keep searching forever.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param bool only_standard: If True, only returns the standard results from
        each page. If False, it returns every possible link from each page,
        except for those that point back to Google itself. Defaults to False
        for backwards compatibility with older versions of this module.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query. Use None for no extra parameters.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # A dict literal as default would be shared between calls; the main
    # search function expects a dictionary, so None is replaced here.
    if extra_params is None:
        extra_params = {}
    return search(query, tld, lang, tbs, safe, num, start, stop, domains,
                  pause, only_standard, extra_params, tpe='bks')
|
569 | |
|
570 | |
|
571 | |
# Shortcut to search apps.
|
572 | |
def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
                start=0, stop=None, domains=None, pause=2.0,
                only_standard=False, extra_params=None):
    """
    Shortcut to search apps.

    Calls the main search function with the search type fixed to 'app'.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param int num: Number of results per page.
    :param int start: First result to retrieve.
    :param int or None stop: Last result to retrieve.
        Use None to keep searching forever.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param float pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!
    :param bool only_standard: If True, only returns the standard results from
        each page. If False, it returns every possible link from each page,
        except for those that point back to Google itself. Defaults to False
        for backwards compatibility with older versions of this module.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query. Use None for no extra parameters.

    :rtype: generator of str
    :return: Generator (iterator) that yields found URLs.
        If the stop parameter is None the iterator will loop forever.
    """
    # A dict literal as default would be shared between calls; the main
    # search function expects a dictionary, so None is replaced here.
    if extra_params is None:
        extra_params = {}
    return search(query, tld, lang, tbs, safe, num, start, stop, domains,
                  pause, only_standard, extra_params, tpe='app')
|
613 | |
|
614 | |
|
615 | 364 |
# Shortcut to single-item search.
|
616 | 365 |
# Evaluates the iterator to return the single URL as a string.
|
617 | |
def lucky(query, tld='com', lang='en', tbs='0', safe='off',
|
618 | |
only_standard=False, extra_params={}, tpe=''):
|
|
366 |
def lucky(*args, **kwargs):
|
619 | 367 |
"""
|
620 | 368 |
Shortcut to single-item search.
|
621 | 369 |
|
622 | |
:param str query: Query string. Must NOT be url-encoded.
|
623 | |
:param str tld: Top level domain.
|
624 | |
:param str lang: Language.
|
625 | |
:param str tbs: Time limits (i.e "qdr:h" => last hour,
|
626 | |
"qdr:d" => last 24 hours, "qdr:m" => last month).
|
627 | |
:param str safe: Safe search.
|
628 | |
:param int num: Number of results per page.
|
629 | |
:param int start: First result to retrieve.
|
630 | |
:param int or None stop: Last result to retrieve.
|
631 | |
Use None to keep searching forever.
|
632 | |
:param list of str or None domains: A list of web domains to constrain
|
633 | |
the search.
|
634 | |
:param float pause: Lapse to wait between HTTP requests.
|
635 | |
A lapse too long will make the search slow, but a lapse too short may
|
636 | |
cause Google to block your IP. Your mileage may vary!
|
637 | |
:param bool only_standard: If True, only returns the standard results from
|
638 | |
each page. If False, it returns every possible link from each page,
|
639 | |
except for those that point back to Google itself. Defaults to False
|
640 | |
for backwards compatibility with older versions of this module.
|
641 | |
:param dict of str to str extra_params: A dictionary of extra HTTP GET
|
642 | |
parameters, which must be URL encoded. For example if you don't want
|
643 | |
Google to filter similar results you can set the extra_params to
|
644 | |
{'filter': '0'} which will append '&filter=0' to every query.
|
645 | |
:param str tpe: Search type (images, videos, news, shopping, books, apps)
|
646 | |
Use the following values {videos: 'vid', images: 'isch',
|
647 | |
news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
|
648 | |
:param str or None user_agent: User agent for the HTTP requests.
|
649 | |
Use None for the default.
|
|
370 |
Same arguments as the main search function, but the return value changes.
|
650 | 371 |
|
651 | 372 |
:rtype: str
|
652 | 373 |
:return: URL found by Google.
|
653 | 374 |
"""
|
654 | |
gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
|
655 | |
extra_params, tpe)
|
656 | |
return next(gen)
|
657 | |
|
658 | |
|
659 | |
# Returns only the number of Google hits for the given search query.
|
660 | |
# This is the number reported by Google itself, NOT by scraping.
|
661 | |
def hits(query, tld='com', lang='en', tbs='0', safe='off',
         domains=None, extra_params=None, tpe='', user_agent=None):
    """
    Search the given query string using Google and return the number of hits.

    :note: This is the number reported by Google itself, NOT by scraping.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (e.g. "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        the query. Use None for no extra parameters.
    :param str tpe: Search type (images, videos, news, shopping, books, apps)
        Use the following values {videos: 'vid', images: 'isch',
        news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
    :param str or None user_agent: User agent for the HTTP requests.
        Use None for the default.

    :rtype: int
    :return: Number of Google hits for the given search query, or 0 when
        the response page carries no usable result statistics.

    :raises ValueError: If extra_params overrides a built-in GET parameter.
    """

    # Avoid a mutable default value shared between calls.
    if extra_params is None:
        extra_params = {}

    # Prepare domain list if it exists.
    # The operators are joined with plain spaces, as the main search function
    # does: quote_plus() below turns spaces into '+', whereas literal '+'
    # characters would be percent-encoded and sent to Google as plus signs.
    if domains:
        query = query + ' ' + ' OR '.join(
            'site:' + domain for domain in domains)

    # Prepare the search string.
    query = quote_plus(query)

    # Check extra_params for overlapping.
    for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
        if builtin_param in extra_params:
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param
            )

    # Grab the cookie from the home page.
    # The URL templates are filled in from the local variables, so the
    # names tld, lang, query, tbs, safe and tpe must stay as they are.
    get_page(url_home % vars(), user_agent)

    # Prepare the URL of the first (and in this case ONLY) request.
    url = url_search % vars()

    # Append extra GET parameters to the URL, once each.
    for k, v in extra_params.items():
        url = url + ('&%s=%s' % (k, v))

    # Request the Google Search results page.
    html = get_page(url, user_agent)

    # Parse the response.
    if is_bs4:
        soup = BeautifulSoup(html, 'html.parser')
    else:
        soup = BeautifulSoup(html)

    # Get the number of hits.
    tags = soup.find_all(attrs={"class": "sd", "id": "resultStats"})
    if not tags:
        # No result statistics element in the page: report no hits
        # instead of failing with an IndexError.
        return 0
    hits_text_parts = tags[0].text.split()
    if len(hits_text_parts) < 3:
        return 0

    # The second word is taken as the count; thousands separators
    # (either ',' or '.') are stripped before conversion.
    return int(hits_text_parts[1].replace(',', '').replace('.', ''))
|
749 | |
|
750 | |
|
751 | |
def ngd(term1, term2):
    """
    Return the Normalized Google distance between words.

    For more info, refer to:
    https://en.wikipedia.org/wiki/Normalized_Google_distance

    :param str term1: First term to compare.
    :param str term2: Second term to compare.

    :rtype: float
    :return: Normalized Google distance between words. If either term, or
        the two terms quoted together, has no hits at all, the distance is
        reported as infinity.

    :raises ValueError: If the reference query used to estimate the size of
        the index reports no hits.
    """

    hits1 = hits(term1)
    hits2 = hits(term2)
    hits_mix = hits('"' + term1 + '" "' + term2 + '"')

    # The logarithm of zero is undefined. Terms that never occur (or never
    # occur together) are treated as infinitely distant rather than
    # failing with a "math domain error".
    if not (hits1 and hits2 and hits_mix):
        return float('inf')

    # Number of hits for a very common word, scaled by a fixed factor.
    # NOTE(review): presumably a rough estimate of the total number of
    # indexed pages - confirm the choice of factor.
    npages = hits('the')
    fix = 1000
    if npages <= 0:
        raise ValueError(
            'cannot estimate the index size: no hits for the reference query')

    lhits1 = math.log10(hits1)
    lhits2 = math.log10(hits2)
    lhits_mix = math.log10(hits_mix)

    lN = math.log10(npages * fix)
    numerator = max([lhits1, lhits2]) - lhits_mix
    denomin = lN - min([lhits1, lhits2])

    return numerator / denomin
|
|
375 |
return next(search(*args, **kwargs))
|