Codebase list python-googlesearch / 19abad4
Update upstream source from tag 'upstream/2.0.3'. Update to upstream version '2.0.3' with Debian dir 23f01bae18c25b733c06069de12814c5368a6dfc. Sophie Brun, 3 years ago.
5 changed file(s) with 214 addition(s) and 536 deletion(s). Raw diff Collapse all Expand all
00 Metadata-Version: 1.1
11 Name: google
2 Version: 2.0.2
2 Version: 2.0.3
33 Summary: Python bindings to the Google search engine.
44 Home-page: http://breakingcode.wordpress.com/
55 Author: Mario Vilas
66 Author-email: [email protected]
77 License: UNKNOWN
8 Description: googlesearch
9 ============
10
11 Google search from Python.
12
13 https://python-googlesearch.readthedocs.io/en/latest/
14
15 Usage example
16 -------------
17
18 # Get the first 20 hits for: "Breaking Code" WordPress blog
19 from googlesearch import search
20 for url in search('"Breaking Code" WordPress blog', stop=20):
21 print(url)
22
23 Installing
24 ----------
25
26 pip install google
27
8 Description: UNKNOWN
289 Platform: UNKNOWN
2910 Classifier: Development Status :: 5 - Production/Stable
3011 Classifier: Intended Audience :: Developers
00 Metadata-Version: 1.1
11 Name: google
2 Version: 2.0.2
2 Version: 2.0.3
33 Summary: Python bindings to the Google search engine.
44 Home-page: http://breakingcode.wordpress.com/
55 Author: Mario Vilas
66 Author-email: [email protected]
77 License: UNKNOWN
8 Description: googlesearch
9 ============
10
11 Google search from Python.
12
13 https://python-googlesearch.readthedocs.io/en/latest/
14
15 Usage example
16 -------------
17
18 # Get the first 20 hits for: "Breaking Code" WordPress blog
19 from googlesearch import search
20 for url in search('"Breaking Code" WordPress blog', stop=20):
21 print(url)
22
23 Installing
24 ----------
25
26 pip install google
27
8 Description: UNKNOWN
289 Platform: UNKNOWN
2910 Classifier: Development Status :: 5 - Production/Stable
3011 Classifier: Intended Audience :: Developers
00 #!/usr/bin/env python
11
22 # Python bindings to the Google search engine
3 # Copyright (c) 2009-2018, Mario Vilas
3 # Copyright (c) 2009-2019, Mario Vilas
44 # All rights reserved.
55 #
66 # Redistribution and use in source and binary forms, with or without
3131 import random
3232 import sys
3333 import time
34 import math
3534
3635 if sys.version_info[0] > 2:
3736 from http.cookiejar import LWPCookieJar
6362 # Shortcut for "get lucky" search.
6463 'lucky',
6564
66 # Computations based on the number of Google hits.
67 'hits', 'ngd',
68
6965 # Miscellaneous utility functions.
70 'get_random_user_agent',
66 'get_random_user_agent', 'get_tbs',
7167 ]
7268
7369 # URL templates to make Google searches.
7470 url_home = "https://www.google.%(tld)s/"
7571 url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
76 "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
72 "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
73 "cr=%(country)s"
7774 url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
78 "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
75 "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
76 "cr=%(country)s"
7977 url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
8078 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
81 "tbm=%(tpe)s"
79 "tbm=%(tpe)s&cr=%(country)s"
8280 url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
8381 "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
84 "safe=%(safe)s&tbm=%(tpe)s"
82 "safe=%(safe)s&tbm=%(tpe)s&cr=%(country)s"
83 url_parameters = (
84 'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'tbm', 'cr')
8585
8686 # Cookie jar. Stored at the user's home folder.
87 # If the cookie jar is inaccessible, the errors are ignored.
8788 home_folder = os.getenv('HOME')
8889 if not home_folder:
8990 home_folder = os.getenv('USERHOME')
99100 USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
100101
101102 # Load the list of valid user agents from the install folder.
103 # The search order is:
104 # * user_agents.txt.gz
105 # * user_agents.txt
106 # * default user agent
102107 try:
103108 install_folder = os.path.abspath(os.path.split(__file__)[0])
104109 try:
129134 return random.choice(user_agents_list)
130135
131136
137 # Helper function to format the tbs parameter.
def get_tbs(from_date, to_date):
    """
    Helper function to format the tbs parameter.

    Builds a custom date range string from two dates, each rendered
    as MM/DD/YYYY.

    :param datetime.date from_date: Python date object.
    :param datetime.date to_date: Python date object.

    :rtype: str
    :return: Dates encoded in tbs format.
    """
    # Format both dates explicitly instead of rebinding the parameters
    # and interpolating through vars(), so the inputs keep their type
    # and the format string only sees the two values it needs.
    date_format = '%m/%d/%Y'
    return 'cdr:1,cd_min:%s,cd_max:%s' % (
        from_date.strftime(date_format),
        to_date.strftime(date_format),
    )
151
152
132153 # Request the given URL and return the response page, using the cookie jar.
154 # If the cookie jar is inaccessible, the errors are ignored.
133155 def get_page(url, user_agent=None):
134156 """
135157 Request the given URL and return the response page, using the cookie jar.
166188 def filter_result(link):
167189 try:
168190
169 # Valid results are absolute URLs not pointing to a Google domain
170 # like images.google.com or googleusercontent.com
191 # Decode hidden URLs.
192 if link.startswith('/url?'):
193 o = urlparse(link, 'http')
194 link = parse_qs(o.query)['q'][0]
195
196 # Valid results are absolute URLs not pointing to a Google domain,
197 # like images.google.com or googleusercontent.com for example.
198 # TODO this could be improved!
171199 o = urlparse(link, 'http')
172200 if o.netloc and 'google' not in o.netloc:
173201 return link
174202
175 # Decode hidden URLs.
176 if link.startswith('/url?'):
177 link = parse_qs(o.query)['q'][0]
178
179 # Valid results are absolute URLs not pointing to a Google domain
180 # like images.google.com or googleusercontent.com
181 o = urlparse(link, 'http')
182 if o.netloc and 'google' not in o.netloc:
183 return link
184
185 # Otherwise, or on error, return None.
203 # On error, return None.
186204 except Exception:
187205 pass
188 return None
189206
190207
191208 # Returns a generator that yields URLs.
192209 def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
193 stop=None, domains=None, pause=2.0, only_standard=False,
194 extra_params={}, tpe='', user_agent=None):
210 stop=None, domains=None, pause=2.0, tpe='', country='',
211 extra_params=None, user_agent=None):
195212 """
196213 Search the given query string using Google.
197214
203220 :param str safe: Safe search.
204221 :param int num: Number of results per page.
205222 :param int start: First result to retrieve.
206 :param int or None stop: Last result to retrieve.
223 :param int stop: Last result to retrieve.
207224 Use None to keep searching forever.
208 :param list of str or None domains: A list of web domains to constrain
225 :param list domains: A list of web domains to constrain
209226 the search.
210227 :param float pause: Lapse to wait between HTTP requests.
211228 A lapse too long will make the search slow, but a lapse too short may
212229 cause Google to block your IP. Your mileage may vary!
213 :param bool only_standard: If True, only returns the standard results from
214 each page. If False, it returns every possible link from each page,
215 except for those that point back to Google itself. Defaults to False
216 for backwards compatibility with older versions of this module.
217 :param dict of str to str extra_params: A dictionary of extra HTTP GET
230 :param str tpe: Search type (images, videos, news, shopping, books, apps)
231 Use the following values {videos: 'vid', images: 'isch',
232 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
233 :param str country: Country or region to focus the search on. Similar to
234 changing the TLD, but does not yield exactly the same results.
235 Only Google knows why...
236 :param dict extra_params: A dictionary of extra HTTP GET
218237 parameters, which must be URL encoded. For example if you don't want
219238 Google to filter similar results you can set the extra_params to
220239 {'filter': '0'} which will append '&filter=0' to every query.
221 :param str tpe: Search type (images, videos, news, shopping, books, apps)
222 Use the following values {videos: 'vid', images: 'isch',
223 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
224 :param str or None user_agent: User agent for the HTTP requests.
240 :param str user_agent: User agent for the HTTP requests.
225241 Use None for the default.
226242
227243 :rtype: generator of str
232248 # This is used to avoid repeated results.
233249 hashes = set()
234250
235 # Count the number of links yielded
251 # Count the number of links yielded.
236252 count = 0
237253
238254 # Prepare domain list if it exists.
243259 # Prepare the search string.
244260 query = quote_plus(query)
245261
246 # Check extra_params for overlapping
247 for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
262 # If no extra_params is given, create an empty dictionary.
263 # We should avoid using an empty dictionary as a default value
264 # in a function parameter in Python.
265 if not extra_params:
266 extra_params = {}
267
268 # Check extra_params for overlapping.
269 for builtin_param in url_parameters:
248270 if builtin_param in extra_params.keys():
249271 raise ValueError(
250272 'GET parameter "%s" is overlapping with \
269291
270292 # Loop until we reach the maximum result, if any (otherwise, loop forever).
271293 while not stop or count < stop:
272 # Remeber last count to detect the end of results
294
295 # Remember last count to detect the end of results.
273296 last_count = count
274297
275 try: # Is it python<3?
276 iter_extra_params = extra_params.iteritems()
277 except AttributeError: # Or python>3?
278 iter_extra_params = extra_params.items()
279 # Append extra GET_parameters to URL
280 for k, v in iter_extra_params:
281 url += url + ('&%s=%s' % (k, v))
298 # Append extra GET parameters to the URL.
299 # This is done on every iteration because we're
300 # rebuilding the entire URL at the end of this loop.
301 for k, v in extra_params.items():
302 k = quote_plus(k)
303 v = quote_plus(v)
304 url = url + ('&%s=%s' % (k, v))
282305
283306 # Sleep between requests.
307 # Keeps Google from banning you for making too many requests.
284308 time.sleep(pause)
285309
286310 # Request the Google Search results page.
287311 html = get_page(url, user_agent)
288312
289 # Parse the response and process every anchored URL.
313 # Parse the response and get every anchored URL.
290314 if is_bs4:
291315 soup = BeautifulSoup(html, 'html.parser')
292316 else:
294318 try:
295319 anchors = soup.find(id='search').findAll('a')
296320 # Sometimes (depending on the User-agent) there is
297 # no id "search" in html response
321 # no id "search" in html response...
298322 except AttributeError:
299 # Remove links of the top bar
323 # Remove links of the top bar.
300324 gbar = soup.find(id='gbar')
301325 if gbar:
302326 gbar.clear()
303327 anchors = soup.findAll('a')
328
329 # Process every anchored URL.
304330 for a in anchors:
305
306 # Leave only the "standard" results if requested.
307 # Otherwise grab all possible links.
308 if only_standard and (
309 not a.parent or a.parent.name.lower() != "h3"):
310 continue
311331
312332 # Get the URL from the anchor tag.
313333 try:
329349 # Yield the result.
330350 yield link
331351
352 # Increase the results counter.
353 # If we reached the limit, stop.
332354 count += 1
333355 if stop and count >= stop:
334356 return
335357
336358 # End if there are no more results.
359 # XXX TODO review this logic, not sure if this is still true!
337360 if last_count == count:
338361 break
339362
347370
348371 # Shortcut to search images.
349372 # Beware, this does not return the image link.
350 def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
351 start=0, stop=None, pause=2.0, domains=None,
352 only_standard=False, extra_params={}):
373 def search_images(*args, **kwargs):
353374 """
354375 Shortcut to search images.
355376
377 Same arguments and return value as the main search function.
378
356379 :note: Beware, this does not return the image link.
357
358 :param str query: Query string. Must NOT be url-encoded.
359 :param str tld: Top level domain.
360 :param str lang: Language.
361 :param str tbs: Time limits (i.e "qdr:h" => last hour,
362 "qdr:d" => last 24 hours, "qdr:m" => last month).
363 :param str safe: Safe search.
364 :param int num: Number of results per page.
365 :param int start: First result to retrieve.
366 :param int or None stop: Last result to retrieve.
367 Use None to keep searching forever.
368 :param list of str or None domains: A list of web domains to constrain
369 the search.
370 :param float pause: Lapse to wait between HTTP requests.
371 A lapse too long will make the search slow, but a lapse too short may
372 cause Google to block your IP. Your mileage may vary!
373 :param bool only_standard: If True, only returns the standard results from
374 each page. If False, it returns every possible link from each page,
375 except for those that point back to Google itself. Defaults to False
376 for backwards compatibility with older versions of this module.
377 :param dict of str to str extra_params: A dictionary of extra HTTP GET
378 parameters, which must be URL encoded. For example if you don't want
379 Google to filter similar results you can set the extra_params to
380 {'filter': '0'} which will append '&filter=0' to every query.
381 :param str tpe: Search type (images, videos, news, shopping, books, apps)
382 Use the following values {videos: 'vid', images: 'isch',
383 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
384 :param str or None user_agent: User agent for the HTTP requests.
385 Use None for the default.
386
387 :rtype: generator of str
388 :return: Generator (iterator) that yields found URLs.
389 If the stop parameter is None the iterator will loop forever.
390 """
391 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
392 pause, only_standard, extra_params, tpe='isch')
380 """
381 kwargs['tpe'] = 'isch'
382 return search(*args, **kwargs)
393383
394384
395385 # Shortcut to search news.
396 def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
397 start=0, stop=None, domains=None, pause=2.0,
398 only_standard=False, extra_params={}):
386 def search_news(*args, **kwargs):
399387 """
400388 Shortcut to search news.
401389
402 :param str query: Query string. Must NOT be url-encoded.
403 :param str tld: Top level domain.
404 :param str lang: Language.
405 :param str tbs: Time limits (i.e "qdr:h" => last hour,
406 "qdr:d" => last 24 hours, "qdr:m" => last month).
407 :param str safe: Safe search.
408 :param int num: Number of results per page.
409 :param int start: First result to retrieve.
410 :param int or None stop: Last result to retrieve.
411 Use None to keep searching forever.
412 :param list of str or None domains: A list of web domains to constrain
413 the search.
414 :param float pause: Lapse to wait between HTTP requests.
415 A lapse too long will make the search slow, but a lapse too short may
416 cause Google to block your IP. Your mileage may vary!
417 :param bool only_standard: If True, only returns the standard results from
418 each page. If False, it returns every possible link from each page,
419 except for those that point back to Google itself. Defaults to False
420 for backwards compatibility with older versions of this module.
421 :param dict of str to str extra_params: A dictionary of extra HTTP GET
422 parameters, which must be URL encoded. For example if you don't want
423 Google to filter similar results you can set the extra_params to
424 {'filter': '0'} which will append '&filter=0' to every query.
425 :param str tpe: Search type (images, videos, news, shopping, books, apps)
426 Use the following values {videos: 'vid', images: 'isch',
427 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
428 :param str or None user_agent: User agent for the HTTP requests.
429 Use None for the default.
430
431 :rtype: generator of str
432 :return: Generator (iterator) that yields found URLs.
433 If the stop parameter is None the iterator will loop forever.
434 """
435 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
436 pause, only_standard, extra_params, tpe='nws')
390 Same arguments and return value as the main search function.
391 """
392 kwargs['tpe'] = 'nws'
393 return search(*args, **kwargs)
437394
438395
439396 # Shortcut to search videos.
440 def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
441 start=0, stop=None, domains=None, pause=2.0,
442 only_standard=False, extra_params={}):
397 def search_videos(*args, **kwargs):
443398 """
444399 Shortcut to search videos.
445400
446 :param str query: Query string. Must NOT be url-encoded.
447 :param str tld: Top level domain.
448 :param str lang: Language.
449 :param str tbs: Time limits (i.e "qdr:h" => last hour,
450 "qdr:d" => last 24 hours, "qdr:m" => last month).
451 :param str safe: Safe search.
452 :param int num: Number of results per page.
453 :param int start: First result to retrieve.
454 :param int or None stop: Last result to retrieve.
455 Use None to keep searching forever.
456 :param list of str or None domains: A list of web domains to constrain
457 the search.
458 :param float pause: Lapse to wait between HTTP requests.
459 A lapse too long will make the search slow, but a lapse too short may
460 cause Google to block your IP. Your mileage may vary!
461 :param bool only_standard: If True, only returns the standard results from
462 each page. If False, it returns every possible link from each page,
463 except for those that point back to Google itself. Defaults to False
464 for backwards compatibility with older versions of this module.
465 :param dict of str to str extra_params: A dictionary of extra HTTP GET
466 parameters, which must be URL encoded. For example if you don't want
467 Google to filter similar results you can set the extra_params to
468 {'filter': '0'} which will append '&filter=0' to every query.
469 :param str tpe: Search type (images, videos, news, shopping, books, apps)
470 Use the following values {videos: 'vid', images: 'isch',
471 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
472 :param str or None user_agent: User agent for the HTTP requests.
473 Use None for the default.
474
475 :rtype: generator of str
476 :return: Generator (iterator) that yields found URLs.
477 If the stop parameter is None the iterator will loop forever.
478 """
479 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
480 pause, only_standard, extra_params, tpe='vid')
401 Same arguments and return value as the main search function.
402 """
403 kwargs['tpe'] = 'vid'
404 return search(*args, **kwargs)
481405
482406
483407 # Shortcut to search shop.
484 def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
485 start=0, stop=None, domains=None, pause=2.0,
486 only_standard=False, extra_params={}):
408 def search_shop(*args, **kwargs):
487409 """
488410 Shortcut to search shop.
489411
490 :param str query: Query string. Must NOT be url-encoded.
491 :param str tld: Top level domain.
492 :param str lang: Language.
493 :param str tbs: Time limits (i.e "qdr:h" => last hour,
494 "qdr:d" => last 24 hours, "qdr:m" => last month).
495 :param str safe: Safe search.
496 :param int num: Number of results per page.
497 :param int start: First result to retrieve.
498 :param int or None stop: Last result to retrieve.
499 Use None to keep searching forever.
500 :param list of str or None domains: A list of web domains to constrain
501 the search.
502 :param float pause: Lapse to wait between HTTP requests.
503 A lapse too long will make the search slow, but a lapse too short may
504 cause Google to block your IP. Your mileage may vary!
505 :param bool only_standard: If True, only returns the standard results from
506 each page. If False, it returns every possible link from each page,
507 except for those that point back to Google itself. Defaults to False
508 for backwards compatibility with older versions of this module.
509 :param dict of str to str extra_params: A dictionary of extra HTTP GET
510 parameters, which must be URL encoded. For example if you don't want
511 Google to filter similar results you can set the extra_params to
512 {'filter': '0'} which will append '&filter=0' to every query.
513 :param str tpe: Search type (images, videos, news, shopping, books, apps)
514 Use the following values {videos: 'vid', images: 'isch',
515 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
516 :param str or None user_agent: User agent for the HTTP requests.
517 Use None for the default.
518
519 :rtype: generator of str
520 :return: Generator (iterator) that yields found URLs.
521 If the stop parameter is None the iterator will loop forever.
522 """
523 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
524 pause, only_standard, extra_params, tpe='shop')
412 Same arguments and return value as the main search function.
413 """
414 kwargs['tpe'] = 'shop'
415 return search(*args, **kwargs)
525416
526417
527418 # Shortcut to search books.
528 def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
529 start=0, stop=None, domains=None, pause=2.0,
530 only_standard=False, extra_params={}):
419 def search_books(*args, **kwargs):
531420 """
532421 Shortcut to search books.
533422
534 :param str query: Query string. Must NOT be url-encoded.
535 :param str tld: Top level domain.
536 :param str lang: Language.
537 :param str tbs: Time limits (i.e "qdr:h" => last hour,
538 "qdr:d" => last 24 hours, "qdr:m" => last month).
539 :param str safe: Safe search.
540 :param int num: Number of results per page.
541 :param int start: First result to retrieve.
542 :param int or None stop: Last result to retrieve.
543 Use None to keep searching forever.
544 :param list of str or None domains: A list of web domains to constrain
545 the search.
546 :param float pause: Lapse to wait between HTTP requests.
547 A lapse too long will make the search slow, but a lapse too short may
548 cause Google to block your IP. Your mileage may vary!
549 :param bool only_standard: If True, only returns the standard results from
550 each page. If False, it returns every possible link from each page,
551 except for those that point back to Google itself. Defaults to False
552 for backwards compatibility with older versions of this module.
553 :param dict of str to str extra_params: A dictionary of extra HTTP GET
554 parameters, which must be URL encoded. For example if you don't want
555 Google to filter similar results you can set the extra_params to
556 {'filter': '0'} which will append '&filter=0' to every query.
557 :param str tpe: Search type (images, videos, news, shopping, books, apps)
558 Use the following values {videos: 'vid', images: 'isch',
559 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
560 :param str or None user_agent: User agent for the HTTP requests.
561 Use None for the default.
562
563 :rtype: generator of str
564 :return: Generator (iterator) that yields found URLs.
565 If the stop parameter is None the iterator will loop forever.
566 """
567 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
568 pause, only_standard, extra_params, tpe='bks')
423 Same arguments and return value as the main search function.
424 """
425 kwargs['tpe'] = 'bks'
426 return search(*args, **kwargs)
569427
570428
571429 # Shortcut to search apps.
572 def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
573 start=0, stop=None, domains=None, pause=2.0,
574 only_standard=False, extra_params={}):
430 def search_apps(*args, **kwargs):
575431 """
576432 Shortcut to search apps.
577433
578 :param str query: Query string. Must NOT be url-encoded.
579 :param str tld: Top level domain.
580 :param str lang: Language.
581 :param str tbs: Time limits (i.e "qdr:h" => last hour,
582 "qdr:d" => last 24 hours, "qdr:m" => last month).
583 :param str safe: Safe search.
584 :param int num: Number of results per page.
585 :param int start: First result to retrieve.
586 :param int or None stop: Last result to retrieve.
587 Use None to keep searching forever.
588 :param list of str or None domains: A list of web domains to constrain
589 the search.
590 :param float pause: Lapse to wait between HTTP requests.
591 A lapse too long will make the search slow, but a lapse too short may
592 cause Google to block your IP. Your mileage may vary!
593 :param bool only_standard: If True, only returns the standard results from
594 each page. If False, it returns every possible link from each page,
595 except for those that point back to Google itself. Defaults to False
596 for backwards compatibility with older versions of this module.
597 :param dict of str to str extra_params: A dictionary of extra HTTP GET
598 parameters, which must be URL encoded. For example if you don't want
599 Google to filter similar results you can set the extra_params to
600 {'filter': '0'} which will append '&filter=0' to every query.
601 :param str tpe: Search type (images, videos, news, shopping, books, apps)
602 Use the following values {videos: 'vid', images: 'isch',
603 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
604 :param str or None user_agent: User agent for the HTTP requests.
605 Use None for the default.
606
607 :rtype: generator of str
608 :return: Generator (iterator) that yields found URLs.
609 If the stop parameter is None the iterator will loop forever.
610 """
611 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
612 pause, only_standard, extra_params, tpe='app')
434 Same arguments and return value as the main search function.
435 """
436 kwargs['tpe'] = 'app'
437 return search(*args, **kwargs)
613438
614439
615440 # Shortcut to single-item search.
616441 # Evaluates the iterator to return the single URL as a string.
617 def lucky(query, tld='com', lang='en', tbs='0', safe='off',
618 only_standard=False, extra_params={}, tpe=''):
442 def lucky(*args, **kwargs):
619443 """
620444 Shortcut to single-item search.
621445
622 :param str query: Query string. Must NOT be url-encoded.
623 :param str tld: Top level domain.
624 :param str lang: Language.
625 :param str tbs: Time limits (i.e "qdr:h" => last hour,
626 "qdr:d" => last 24 hours, "qdr:m" => last month).
627 :param str safe: Safe search.
628 :param int num: Number of results per page.
629 :param int start: First result to retrieve.
630 :param int or None stop: Last result to retrieve.
631 Use None to keep searching forever.
632 :param list of str or None domains: A list of web domains to constrain
633 the search.
634 :param float pause: Lapse to wait between HTTP requests.
635 A lapse too long will make the search slow, but a lapse too short may
636 cause Google to block your IP. Your mileage may vary!
637 :param bool only_standard: If True, only returns the standard results from
638 each page. If False, it returns every possible link from each page,
639 except for those that point back to Google itself. Defaults to False
640 for backwards compatibility with older versions of this module.
641 :param dict of str to str extra_params: A dictionary of extra HTTP GET
642 parameters, which must be URL encoded. For example if you don't want
643 Google to filter similar results you can set the extra_params to
644 {'filter': '0'} which will append '&filter=0' to every query.
645 :param str tpe: Search type (images, videos, news, shopping, books, apps)
646 Use the following values {videos: 'vid', images: 'isch',
647 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
648 :param str or None user_agent: User agent for the HTTP requests.
649 Use None for the default.
446 Same arguments as the main search function, but the return value changes.
650447
651448 :rtype: str
652449 :return: URL found by Google.
653450 """
654 gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
655 extra_params, tpe)
656 return next(gen)
657
658
659 # Returns only the number of Google hits for the given search query.
660 # This is the number reported by Google itself, NOT by scraping.
661 def hits(query, tld='com', lang='en', tbs='0', safe='off',
662 domains=None, extra_params={}, tpe='', user_agent=None):
663 """
664 Search the given query string using Google and return the number of hits.
665
666 :note: This is the number reported by Google itself, NOT by scraping.
667
668 :param str query: Query string. Must NOT be url-encoded.
669 :param str tld: Top level domain.
670 :param str lang: Language.
671 :param str tbs: Time limits (i.e "qdr:h" => last hour,
672 "qdr:d" => last 24 hours, "qdr:m" => last month).
673 :param str safe: Safe search.
674 :param int num: Number of results per page.
675 :param int start: First result to retrieve.
676 :param int or None stop: Last result to retrieve.
677 Use None to keep searching forever.
678 :param list of str or None domains: A list of web domains to constrain
679 the search.
680 :param float pause: Lapse to wait between HTTP requests.
681 A lapse too long will make the search slow, but a lapse too short may
682 cause Google to block your IP. Your mileage may vary!
683 :param bool only_standard: If True, only returns the standard results from
684 each page. If False, it returns every possible link from each page,
685 except for those that point back to Google itself. Defaults to False
686 for backwards compatibility with older versions of this module.
687 :param dict of str to str extra_params: A dictionary of extra HTTP GET
688 parameters, which must be URL encoded. For example if you don't want
689 Google to filter similar results you can set the extra_params to
690 {'filter': '0'} which will append '&filter=0' to every query.
691 :param str tpe: Search type (images, videos, news, shopping, books, apps)
692 Use the following values {videos: 'vid', images: 'isch',
693 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
694 :param str or None user_agent: User agent for the HTTP requests.
695 Use None for the default.
696
697 :rtype: int
698 :return: Number of Google hits for the given search query.
699 """
700
701 # Prepare domain list if it exists.
702 if domains:
703 domain_query = '+OR+'.join('site:' + domain for domain in domains)
704 domain_query = '+' + domain_query
705 else:
706 domain_query = ''
707
708 # Prepare the search string.
709 query = quote_plus(query + domain_query)
710
711 # Check extra_params for overlapping
712 for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
713 if builtin_param in extra_params.keys():
714 raise ValueError(
715 'GET parameter "%s" is overlapping with \
716 the built-in GET parameter',
717 builtin_param
718 )
719
720 # Grab the cookie from the home page.
721 get_page(url_home % vars(), user_agent)
722
723 # Prepare the URL of the first (and in this cases ONLY) request.
724 url = url_search % vars()
725
726 try: # Is it python<3?
727 iter_extra_params = extra_params.iteritems()
728 except AttributeError: # Or python>3?
729 iter_extra_params = extra_params.items()
730 # Append extra GET_parameters to URL
731 for k, v in iter_extra_params:
732 url += url + ('&%s=%s' % (k, v))
733
734 # Request the Google Search results page.
735 html = get_page(url, user_agent)
736
737 # Parse the response.
738 if is_bs4:
739 soup = BeautifulSoup(html, 'html.parser')
740 else:
741 soup = BeautifulSoup(html)
742
743 # Get the number of hits.
744 tag = soup.find_all(attrs={"class": "sd", "id": "resultStats"})[0]
745 hits_text_parts = tag.text.split()
746 if len(hits_text_parts) < 3:
747 return 0
748 return int(hits_text_parts[1].replace(',', '').replace('.', ''))
749
750
def ngd(term1, term2):
    """
    Return the Normalized Google distance between words.

    For more info, refer to:
    https://en.wikipedia.org/wiki/Normalized_Google_distance

    :param str term1: First term to compare.
    :param str term2: Second term to compare.

    :rtype: float
    :return: Normalized Google distance between words.
        ``float('inf')`` is returned when any of the hit counts involved
        is zero, since the distance is undefined in that case.
    """

    # Fetch the raw hit counts first (one request each, same order as
    # before) so they can be validated before taking logarithms.
    hits1 = hits(term1)
    hits2 = hits(term2)
    hits_mix = hits('"' + term1 + '" "' + term2 + '"')
    npages = hits('the')
    fix = 1000

    # hits() returns 0 when no count could be parsed from the results page.
    # math.log10(0) raises ValueError, so report an infinite distance
    # instead of crashing with a "math domain error".
    if min(hits1, hits2, hits_mix, npages) <= 0:
        return float('inf')

    lhits1 = math.log10(hits1)
    lhits2 = math.log10(hits2)
    lhits_mix = math.log10(hits_mix)

    # The hit count for 'the', scaled by `fix`, stands in for the total
    # number of indexed pages (N in the NGD formula).
    lN = math.log10(npages * fix)
    numerator = max(lhits1, lhits2) - lhits_mix
    denomin = lN - min(lhits1, lhits2)

    return numerator / denomin
451 return next(search(*args, **kwargs))
00 #!/usr/bin/env python
11
22 # Python bindings to the Google search engine
3 # Copyright (c) 2009-2016, Mario Vilas
3 # Copyright (c) 2009-2019, Mario Vilas
44 # All rights reserved.
55 #
66 # Redistribution and use in source and binary forms, with or without
3131
3232 from googlesearch import search, get_random_user_agent
3333
34 # TODO port to argparse
3435 from optparse import OptionParser, IndentedHelpFormatter
36
3537
3638 class BannerHelpFormatter(IndentedHelpFormatter):
3739
4547 msg = IndentedHelpFormatter.format_usage(self, usage)
4648 return '%s\n%s' % (self.banner, msg)
4749
48 # Parse the command line arguments.
49 formatter = BannerHelpFormatter(
50 "Python script to use the Google search engine\n"
51 "By Mario Vilas (mvilas at gmail dot com)\n"
52 "https://github.com/MarioVilas/googlesearch\n"
53 )
54 parser = OptionParser(formatter=formatter)
55 parser.set_usage("%prog [options] query")
56 parser.add_option("--tld", metavar="TLD", type="string", default="com",
57 help="top level domain to use [default: com]")
58 parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
59 help="produce results in the given language [default: en]")
60 parser.add_option("--domains", metavar="DOMAINS", type="string", default="",
61 help="comma separated list of domains to constrain the search to")
62 parser.add_option("--tbs", metavar="TBS", type="string", default="0",
63 help="produce results from period [default: 0]")
64 parser.add_option("--safe", metavar="SAFE", type="string", default="off",
65 help="kids safe search [default: off]")
66 parser.add_option("--num", metavar="NUMBER", type="int", default=10,
67 help="number of results per page [default: 10]")
68 parser.add_option("--start", metavar="NUMBER", type="int", default=0,
69 help="first result to retrieve [default: 0]")
70 parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
71 help="last result to retrieve [default: unlimited]")
72 parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
73 help="pause between HTTP requests [default: 2.0]")
74 parser.add_option("--rua", metavar="USERAGENT", action="store_true", default=False,
75 help="Randomize the User-Agent [default: no]")
76 parser.add_option("--all", dest="only_standard",
77 action="store_false", default=True,
78 help="grab all possible links from result pages [default: only standard results]")
79 (options, args) = parser.parse_args()
80 query = ' '.join(args)
81 if not query:
82 parser.print_help()
83 sys.exit(2)
84 params = [(k, v) for (k, v) in options.__dict__.items() if not k.startswith('_')]
85 params = dict(params)
8650
87 # Split the comma separated list of domains, if present.
88 if 'domains' in params:
89 params['domains'] = [x.strip() for x in params['domains'].split(',')]
def main():
    """
    Command line entry point.

    Parses the command line options, converts them into keyword arguments
    for ``search()`` and prints each resulting URL on its own line.
    Prints the help text and exits with status 2 when no query is given.
    """

    # Parse the command line arguments.
    formatter = BannerHelpFormatter(
        "Python script to use the Google search engine\n"
        "By Mario Vilas (mvilas at gmail dot com)\n"
        "https://github.com/MarioVilas/googlesearch\n"
    )
    parser = OptionParser(formatter=formatter)
    parser.set_usage("%prog [options] query")
    parser.add_option(
        '--tld', metavar='TLD', type='string', default='com',
        help="top level domain to use [default: com]")
    parser.add_option(
        '--lang', metavar='LANGUAGE', type='string', default='en',
        help="produce results in the given language [default: en]")
    parser.add_option(
        '--domains', metavar='DOMAINS', type='string', default='',
        help="comma separated list of domains to constrain the search to")
    parser.add_option(
        '--tbs', metavar='TBS', type='string', default='0',
        help="produce results from period [default: 0]")
    parser.add_option(
        '--safe', metavar='SAFE', type='string', default='off',
        help="kids safe search [default: off]")
    parser.add_option(
        '--type', metavar='TYPE', type='string', default='search', dest='tpe',
        help="search type (search, images, videos, news, shopping, books,"
             " apps) [default: search]")
    parser.add_option(
        '--country', metavar='COUNTRY', type='string', default='',
        help="region to restrict search on [default: not restricted]")
    parser.add_option(
        '--num', metavar='NUMBER', type='int', default=10,
        help="number of results per page [default: 10]")
    parser.add_option(
        '--start', metavar='NUMBER', type='int', default=0,
        help="first result to retrieve [default: 0]")
    parser.add_option(
        '--stop', metavar='NUMBER', type='int', default=0,
        help="last result to retrieve [default: unlimited]")
    parser.add_option(
        '--pause', metavar='SECONDS', type='float', default=2.0,
        help="pause between HTTP requests [default: 2.0]")
    parser.add_option(
        '--rua', metavar='USERAGENT', action='store_true', default=False,
        help="Randomize the User-Agent [default: no]")
    (options, args) = parser.parse_args()

    # All positional arguments together form the search query.
    query = ' '.join(args)
    if not query:
        parser.print_help()
        sys.exit(2)

    # Turn the parsed options into keyword arguments for search(),
    # skipping any private attributes of the options object.
    params = [
        (k, v) for (k, v) in options.__dict__.items()
        if not k.startswith('_')]
    params = dict(params)

    # Split the comma separated list of domains, if present.
    # Empty entries are dropped: ''.split(',') yields [''], which would
    # otherwise be passed on as a non-empty list holding an empty domain
    # whenever --domains is not given.
    if 'domains' in params:
        params['domains'] = [
            x.strip() for x in params['domains'].split(',') if x.strip()]

    # Use a special search type if requested.
    # NOTE(review): the library docstring for `tpe` lists short codes
    # ('isch', 'vid', 'nws', 'shop', 'bks', 'app'); confirm that search()
    # also accepts the long names validated here.
    if 'tpe' in params:
        tpe = params['tpe']
        if tpe and tpe not in (
                'search', 'images', 'videos', 'news',
                'shopping', 'books', 'apps'):
            parser.error("invalid type: %r" % tpe)
        if tpe == 'search':
            params['tpe'] = ''

    # Randomize the user agent if requested.
    # The 'rua' key is always popped so it is never forwarded to search().
    if 'rua' in params and params.pop('rua'):
        params['user_agent'] = get_random_user_agent()

    # Run the query.
    for url in search(query, **params):
        print(url)
        # Flushing is best effort (stdout may be a closed pipe); a failure
        # here must not abort the listing.
        try:
            sys.stdout.flush()
        except Exception:
            pass
133
134
135 if __name__ == '__main__':
136 main()
4343 scripts=[join('scripts', 'google')],
4444 package_data={'googlesearch': ['user_agents.txt.gz']},
4545 include_package_data=True,
46 version="2.0.2",
46 version="2.0.3",
4747 description="Python bindings to the Google search engine.",
4848 author="Mario Vilas",
4949 author_email="[email protected]",
6565 metadata['install_requires'] = metadata['requires']
6666 except ImportError:
6767 from distutils.core import setup
68
68 """
6969 # Get the long description from the readme file.
7070 try:
7171 metadata['long_description'] = open(join(here, 'README.md'), 'rU').read()
7272 except Exception:
7373 pass
7474
75 # If twine is installed, set the long description content type.
76 try:
77 import twine
78 metadata['long_description_content_type'] = 'text/markdown'
79 except ImportError:
80 pass
81 """
7582 # Run the setup script.
7683 setup(**metadata)