Codebase list python-googlesearch / c9fea811-ed52-471e-b681-9f6bdd469bed/upstream
Import upstream version 3.0.0 Kali Janitor 3 years ago
5 changed file(s) with 168 addition(s) and 598 deletion(s). Raw diff Collapse all Expand all
00 Metadata-Version: 1.1
11 Name: google
2 Version: 2.0.2
2 Version: 3.0.0
33 Summary: Python bindings to the Google search engine.
44 Home-page: http://breakingcode.wordpress.com/
55 Author: Mario Vilas
66 Author-email: [email protected]
77 License: UNKNOWN
8 Description: googlesearch
9 ============
10
11 Google search from Python.
12
13 https://python-googlesearch.readthedocs.io/en/latest/
14
15 Usage example
16 -------------
17
18 # Get the first 20 hits for: "Breaking Code" WordPress blog
19 from googlesearch import search
20 for url in search('"Breaking Code" WordPress blog', stop=20):
21 print(url)
22
23 Installing
24 ----------
25
26 pip install google
27
8 Description: UNKNOWN
289 Platform: UNKNOWN
2910 Classifier: Development Status :: 5 - Production/Stable
3011 Classifier: Intended Audience :: Developers
00 Metadata-Version: 1.1
11 Name: google
2 Version: 2.0.2
2 Version: 3.0.0
33 Summary: Python bindings to the Google search engine.
44 Home-page: http://breakingcode.wordpress.com/
55 Author: Mario Vilas
66 Author-email: [email protected]
77 License: UNKNOWN
8 Description: googlesearch
9 ============
10
11 Google search from Python.
12
13 https://python-googlesearch.readthedocs.io/en/latest/
14
15 Usage example
16 -------------
17
18 # Get the first 20 hits for: "Breaking Code" WordPress blog
19 from googlesearch import search
20 for url in search('"Breaking Code" WordPress blog', stop=20):
21 print(url)
22
23 Installing
24 ----------
25
26 pip install google
27
8 Description: UNKNOWN
289 Platform: UNKNOWN
2910 Classifier: Development Status :: 5 - Production/Stable
3011 Classifier: Intended Audience :: Developers
00 #!/usr/bin/env python
11
2 # Python bindings to the Google search engine
3 # Copyright (c) 2009-2018, Mario Vilas
2 # Copyright (c) 2009-2020, Mario Vilas
43 # All rights reserved.
54 #
65 # Redistribution and use in source and binary forms, with or without
3130 import random
3231 import sys
3332 import time
34 import math
33 import ssl
3534
3635 if sys.version_info[0] > 2:
3736 from http.cookiejar import LWPCookieJar
5554 # Main search function.
5655 'search',
5756
58 # Specialized search functions.
59 'search_images', 'search_news',
60 'search_videos', 'search_shop',
61 'search_books', 'search_apps',
62
6357 # Shortcut for "get lucky" search.
6458 'lucky',
6559
66 # Computations based on the number of Google hits.
67 'hits', 'ngd',
68
6960 # Miscellaneous utility functions.
70 'get_random_user_agent',
61 'get_random_user_agent', 'get_tbs',
7162 ]
7263
7364 # URL templates to make Google searches.
7465 url_home = "https://www.google.%(tld)s/"
7566 url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
76 "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
67 "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
68 "cr=%(country)s"
7769 url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
78 "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
70 "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
71 "cr=%(country)s"
7972 url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
8073 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
81 "tbm=%(tpe)s"
74 "cr=%(country)s"
8275 url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
8376 "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
84 "safe=%(safe)s&tbm=%(tpe)s"
77 "safe=%(safe)s&cr=%(country)s"
78 url_parameters = (
79 'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
8580
8681 # Cookie jar. Stored at the user's home folder.
82 # If the cookie jar is inaccessible, the errors are ignored.
8783 home_folder = os.getenv('HOME')
8884 if not home_folder:
8985 home_folder = os.getenv('USERHOME')
9995 USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
10096
10197 # Load the list of valid user agents from the install folder.
98 # The search order is:
99 # * user_agents.txt.gz
100 # * user_agents.txt
101 # * default user agent
102102 try:
103103 install_folder = os.path.abspath(os.path.split(__file__)[0])
104104 try:
129129 return random.choice(user_agents_list)
130130
131131
132 # Helper function to format the tbs parameter.
133 def get_tbs(from_date, to_date):
134 """
135 Helper function to format the tbs parameter.
136
137 :param datetime.date from_date: Python date object.
138 :param datetime.date to_date: Python date object.
139
140 :rtype: str
141 :return: Dates encoded in tbs format.
142 """
143 from_date = from_date.strftime('%m/%d/%Y')
144 to_date = to_date.strftime('%m/%d/%Y')
145 return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
146
147
132148 # Request the given URL and return the response page, using the cookie jar.
133 def get_page(url, user_agent=None):
149 # If the cookie jar is inaccessible, the errors are ignored.
150 def get_page(url, user_agent=None, verify_ssl=True):
134151 """
135152 Request the given URL and return the response page, using the cookie jar.
136153
137154 :param str url: URL to retrieve.
138155 :param str user_agent: User agent for the HTTP requests.
139156 Use None for the default.
157 :param bool verify_ssl: Verify the SSL certificate to prevent
158 traffic interception attacks. Defaults to True.
140159
141160 :rtype: str
142161 :return: Web page retrieved for the given URL.
150169 request = Request(url)
151170 request.add_header('User-Agent', user_agent)
152171 cookie_jar.add_cookie_header(request)
153 response = urlopen(request)
172 if verify_ssl:
173 response = urlopen(request)
174 else:
175 context = ssl._create_unverified_context()
176 response = urlopen(request, context=context)
154177 cookie_jar.extract_cookies(response, request)
155178 html = response.read()
156179 response.close()
166189 def filter_result(link):
167190 try:
168191
169 # Valid results are absolute URLs not pointing to a Google domain
170 # like images.google.com or googleusercontent.com
192 # Decode hidden URLs.
193 if link.startswith('/url?'):
194 o = urlparse(link, 'http')
195 link = parse_qs(o.query)['q'][0]
196
197 # Valid results are absolute URLs not pointing to a Google domain,
198 # like images.google.com or googleusercontent.com for example.
199 # TODO this could be improved!
171200 o = urlparse(link, 'http')
172201 if o.netloc and 'google' not in o.netloc:
173202 return link
174203
175 # Decode hidden URLs.
176 if link.startswith('/url?'):
177 link = parse_qs(o.query)['q'][0]
178
179 # Valid results are absolute URLs not pointing to a Google domain
180 # like images.google.com or googleusercontent.com
181 o = urlparse(link, 'http')
182 if o.netloc and 'google' not in o.netloc:
183 return link
184
185 # Otherwise, or on error, return None.
204 # On error, return None.
186205 except Exception:
187206 pass
188 return None
189207
190208
191209 # Returns a generator that yields URLs.
192210 def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
193 stop=None, domains=None, pause=2.0, only_standard=False,
194 extra_params={}, tpe='', user_agent=None):
211 stop=None, pause=2.0, country='', extra_params=None,
212 user_agent=None, verify_ssl=True):
195213 """
196214 Search the given query string using Google.
197215
203221 :param str safe: Safe search.
204222 :param int num: Number of results per page.
205223 :param int start: First result to retrieve.
206 :param int or None stop: Last result to retrieve.
224 :param int stop: Last result to retrieve.
207225 Use None to keep searching forever.
208 :param list of str or None domains: A list of web domains to constrain
209 the search.
210226 :param float pause: Lapse to wait between HTTP requests.
211227 A lapse too long will make the search slow, but a lapse too short may
212228 cause Google to block your IP. Your mileage may vary!
213 :param bool only_standard: If True, only returns the standard results from
214 each page. If False, it returns every possible link from each page,
215 except for those that point back to Google itself. Defaults to False
216 for backwards compatibility with older versions of this module.
217 :param dict of str to str extra_params: A dictionary of extra HTTP GET
229 :param str country: Country or region to focus the search on. Similar to
230 changing the TLD, but does not yield exactly the same results.
231 Only Google knows why...
232 :param dict extra_params: A dictionary of extra HTTP GET
218233 parameters, which must be URL encoded. For example if you don't want
219234 Google to filter similar results you can set the extra_params to
220235 {'filter': '0'} which will append '&filter=0' to every query.
221 :param str tpe: Search type (images, videos, news, shopping, books, apps)
222 Use the following values {videos: 'vid', images: 'isch',
223 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
224 :param str or None user_agent: User agent for the HTTP requests.
236 :param str user_agent: User agent for the HTTP requests.
225237 Use None for the default.
238 :param bool verify_ssl: Verify the SSL certificate to prevent
239 traffic interception attacks. Defaults to True.
226240
227241 :rtype: generator of str
228242 :return: Generator (iterator) that yields found URLs.
232246 # This is used to avoid repeated results.
233247 hashes = set()
234248
235 # Count the number of links yielded
249 # Count the number of links yielded.
236250 count = 0
237
238 # Prepare domain list if it exists.
239 if domains:
240 query = query + ' ' + ' OR '.join(
241 'site:' + domain for domain in domains)
242251
243252 # Prepare the search string.
244253 query = quote_plus(query)
245254
246 # Check extra_params for overlapping
247 for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
255 # If no extra_params is given, create an empty dictionary.
256 # We should avoid using an empty dictionary as a default value
257 # in a function parameter in Python.
258 if not extra_params:
259 extra_params = {}
260
261 # Check extra_params for overlapping.
262 for builtin_param in url_parameters:
248263 if builtin_param in extra_params.keys():
249264 raise ValueError(
250265 'GET parameter "%s" is overlapping with \
253268 )
254269
255270 # Grab the cookie from the home page.
256 get_page(url_home % vars(), user_agent)
271 get_page(url_home % vars(), user_agent, verify_ssl)
257272
258273 # Prepare the URL of the first request.
259274 if start:
269284
270285 # Loop until we reach the maximum result, if any (otherwise, loop forever).
271286 while not stop or count < stop:
272 # Remember last count to detect the end of results
287
288 # Remember last count to detect the end of results.
273289 last_count = count
274290
275 try: # Is it python<3?
276 iter_extra_params = extra_params.iteritems()
277 except AttributeError: # Or python>3?
278 iter_extra_params = extra_params.items()
279 # Append extra GET_parameters to URL
280 for k, v in iter_extra_params:
281 url += url + ('&%s=%s' % (k, v))
291 # Append extra GET parameters to the URL.
292 # This is done on every iteration because we're
293 # rebuilding the entire URL at the end of this loop.
294 for k, v in extra_params.items():
295 k = quote_plus(k)
296 v = quote_plus(v)
297 url = url + ('&%s=%s' % (k, v))
282298
283299 # Sleep between requests.
300 # Keeps Google from banning you for making too many requests.
284301 time.sleep(pause)
285302
286303 # Request the Google Search results page.
287 html = get_page(url, user_agent)
288
289 # Parse the response and process every anchored URL.
304 html = get_page(url, user_agent, verify_ssl)
305
306 # Parse the response and get every anchored URL.
290307 if is_bs4:
291308 soup = BeautifulSoup(html, 'html.parser')
292309 else:
294311 try:
295312 anchors = soup.find(id='search').findAll('a')
296313 # Sometimes (depending on the User-agent) there is
297 # no id "search" in html response
314 # no id "search" in html response...
298315 except AttributeError:
299 # Remove links of the top bar
316 # Remove links of the top bar.
300317 gbar = soup.find(id='gbar')
301318 if gbar:
302319 gbar.clear()
303320 anchors = soup.findAll('a')
321
322 # Process every anchored URL.
304323 for a in anchors:
305
306 # Leave only the "standard" results if requested.
307 # Otherwise grab all possible links.
308 if only_standard and (
309 not a.parent or a.parent.name.lower() != "h3"):
310 continue
311324
312325 # Get the URL from the anchor tag.
313326 try:
329342 # Yield the result.
330343 yield link
331344
345 # Increase the results counter.
346 # If we reached the limit, stop.
332347 count += 1
333348 if stop and count >= stop:
334349 return
335350
336351 # End if there are no more results.
352 # XXX TODO review this logic, not sure if this is still true!
337353 if last_count == count:
338354 break
339355
345361 url = url_next_page_num % vars()
346362
347363
348 # Shortcut to search images.
349 # Beware, this does not return the image link.
350 def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
351 start=0, stop=None, pause=2.0, domains=None,
352 only_standard=False, extra_params={}):
353 """
354 Shortcut to search images.
355
356 :note: Beware, this does not return the image link.
357
358 :param str query: Query string. Must NOT be url-encoded.
359 :param str tld: Top level domain.
360 :param str lang: Language.
361 :param str tbs: Time limits (i.e "qdr:h" => last hour,
362 "qdr:d" => last 24 hours, "qdr:m" => last month).
363 :param str safe: Safe search.
364 :param int num: Number of results per page.
365 :param int start: First result to retrieve.
366 :param int or None stop: Last result to retrieve.
367 Use None to keep searching forever.
368 :param list of str or None domains: A list of web domains to constrain
369 the search.
370 :param float pause: Lapse to wait between HTTP requests.
371 A lapse too long will make the search slow, but a lapse too short may
372 cause Google to block your IP. Your mileage may vary!
373 :param bool only_standard: If True, only returns the standard results from
374 each page. If False, it returns every possible link from each page,
375 except for those that point back to Google itself. Defaults to False
376 for backwards compatibility with older versions of this module.
377 :param dict of str to str extra_params: A dictionary of extra HTTP GET
378 parameters, which must be URL encoded. For example if you don't want
379 Google to filter similar results you can set the extra_params to
380 {'filter': '0'} which will append '&filter=0' to every query.
381 :param str tpe: Search type (images, videos, news, shopping, books, apps)
382 Use the following values {videos: 'vid', images: 'isch',
383 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
384 :param str or None user_agent: User agent for the HTTP requests.
385 Use None for the default.
386
387 :rtype: generator of str
388 :return: Generator (iterator) that yields found URLs.
389 If the stop parameter is None the iterator will loop forever.
390 """
391 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
392 pause, only_standard, extra_params, tpe='isch')
393
394
395 # Shortcut to search news.
396 def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
397 start=0, stop=None, domains=None, pause=2.0,
398 only_standard=False, extra_params={}):
399 """
400 Shortcut to search news.
401
402 :param str query: Query string. Must NOT be url-encoded.
403 :param str tld: Top level domain.
404 :param str lang: Language.
405 :param str tbs: Time limits (i.e "qdr:h" => last hour,
406 "qdr:d" => last 24 hours, "qdr:m" => last month).
407 :param str safe: Safe search.
408 :param int num: Number of results per page.
409 :param int start: First result to retrieve.
410 :param int or None stop: Last result to retrieve.
411 Use None to keep searching forever.
412 :param list of str or None domains: A list of web domains to constrain
413 the search.
414 :param float pause: Lapse to wait between HTTP requests.
415 A lapse too long will make the search slow, but a lapse too short may
416 cause Google to block your IP. Your mileage may vary!
417 :param bool only_standard: If True, only returns the standard results from
418 each page. If False, it returns every possible link from each page,
419 except for those that point back to Google itself. Defaults to False
420 for backwards compatibility with older versions of this module.
421 :param dict of str to str extra_params: A dictionary of extra HTTP GET
422 parameters, which must be URL encoded. For example if you don't want
423 Google to filter similar results you can set the extra_params to
424 {'filter': '0'} which will append '&filter=0' to every query.
425 :param str tpe: Search type (images, videos, news, shopping, books, apps)
426 Use the following values {videos: 'vid', images: 'isch',
427 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
428 :param str or None user_agent: User agent for the HTTP requests.
429 Use None for the default.
430
431 :rtype: generator of str
432 :return: Generator (iterator) that yields found URLs.
433 If the stop parameter is None the iterator will loop forever.
434 """
435 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
436 pause, only_standard, extra_params, tpe='nws')
437
438
439 # Shortcut to search videos.
440 def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
441 start=0, stop=None, domains=None, pause=2.0,
442 only_standard=False, extra_params={}):
443 """
444 Shortcut to search videos.
445
446 :param str query: Query string. Must NOT be url-encoded.
447 :param str tld: Top level domain.
448 :param str lang: Language.
449 :param str tbs: Time limits (i.e "qdr:h" => last hour,
450 "qdr:d" => last 24 hours, "qdr:m" => last month).
451 :param str safe: Safe search.
452 :param int num: Number of results per page.
453 :param int start: First result to retrieve.
454 :param int or None stop: Last result to retrieve.
455 Use None to keep searching forever.
456 :param list of str or None domains: A list of web domains to constrain
457 the search.
458 :param float pause: Lapse to wait between HTTP requests.
459 A lapse too long will make the search slow, but a lapse too short may
460 cause Google to block your IP. Your mileage may vary!
461 :param bool only_standard: If True, only returns the standard results from
462 each page. If False, it returns every possible link from each page,
463 except for those that point back to Google itself. Defaults to False
464 for backwards compatibility with older versions of this module.
465 :param dict of str to str extra_params: A dictionary of extra HTTP GET
466 parameters, which must be URL encoded. For example if you don't want
467 Google to filter similar results you can set the extra_params to
468 {'filter': '0'} which will append '&filter=0' to every query.
469 :param str tpe: Search type (images, videos, news, shopping, books, apps)
470 Use the following values {videos: 'vid', images: 'isch',
471 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
472 :param str or None user_agent: User agent for the HTTP requests.
473 Use None for the default.
474
475 :rtype: generator of str
476 :return: Generator (iterator) that yields found URLs.
477 If the stop parameter is None the iterator will loop forever.
478 """
479 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
480 pause, only_standard, extra_params, tpe='vid')
481
482
483 # Shortcut to search shop.
484 def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
485 start=0, stop=None, domains=None, pause=2.0,
486 only_standard=False, extra_params={}):
487 """
488 Shortcut to search shop.
489
490 :param str query: Query string. Must NOT be url-encoded.
491 :param str tld: Top level domain.
492 :param str lang: Language.
493 :param str tbs: Time limits (i.e "qdr:h" => last hour,
494 "qdr:d" => last 24 hours, "qdr:m" => last month).
495 :param str safe: Safe search.
496 :param int num: Number of results per page.
497 :param int start: First result to retrieve.
498 :param int or None stop: Last result to retrieve.
499 Use None to keep searching forever.
500 :param list of str or None domains: A list of web domains to constrain
501 the search.
502 :param float pause: Lapse to wait between HTTP requests.
503 A lapse too long will make the search slow, but a lapse too short may
504 cause Google to block your IP. Your mileage may vary!
505 :param bool only_standard: If True, only returns the standard results from
506 each page. If False, it returns every possible link from each page,
507 except for those that point back to Google itself. Defaults to False
508 for backwards compatibility with older versions of this module.
509 :param dict of str to str extra_params: A dictionary of extra HTTP GET
510 parameters, which must be URL encoded. For example if you don't want
511 Google to filter similar results you can set the extra_params to
512 {'filter': '0'} which will append '&filter=0' to every query.
513 :param str tpe: Search type (images, videos, news, shopping, books, apps)
514 Use the following values {videos: 'vid', images: 'isch',
515 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
516 :param str or None user_agent: User agent for the HTTP requests.
517 Use None for the default.
518
519 :rtype: generator of str
520 :return: Generator (iterator) that yields found URLs.
521 If the stop parameter is None the iterator will loop forever.
522 """
523 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
524 pause, only_standard, extra_params, tpe='shop')
525
526
527 # Shortcut to search books.
528 def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
529 start=0, stop=None, domains=None, pause=2.0,
530 only_standard=False, extra_params={}):
531 """
532 Shortcut to search books.
533
534 :param str query: Query string. Must NOT be url-encoded.
535 :param str tld: Top level domain.
536 :param str lang: Language.
537 :param str tbs: Time limits (i.e "qdr:h" => last hour,
538 "qdr:d" => last 24 hours, "qdr:m" => last month).
539 :param str safe: Safe search.
540 :param int num: Number of results per page.
541 :param int start: First result to retrieve.
542 :param int or None stop: Last result to retrieve.
543 Use None to keep searching forever.
544 :param list of str or None domains: A list of web domains to constrain
545 the search.
546 :param float pause: Lapse to wait between HTTP requests.
547 A lapse too long will make the search slow, but a lapse too short may
548 cause Google to block your IP. Your mileage may vary!
549 :param bool only_standard: If True, only returns the standard results from
550 each page. If False, it returns every possible link from each page,
551 except for those that point back to Google itself. Defaults to False
552 for backwards compatibility with older versions of this module.
553 :param dict of str to str extra_params: A dictionary of extra HTTP GET
554 parameters, which must be URL encoded. For example if you don't want
555 Google to filter similar results you can set the extra_params to
556 {'filter': '0'} which will append '&filter=0' to every query.
557 :param str tpe: Search type (images, videos, news, shopping, books, apps)
558 Use the following values {videos: 'vid', images: 'isch',
559 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
560 :param str or None user_agent: User agent for the HTTP requests.
561 Use None for the default.
562
563 :rtype: generator of str
564 :return: Generator (iterator) that yields found URLs.
565 If the stop parameter is None the iterator will loop forever.
566 """
567 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
568 pause, only_standard, extra_params, tpe='bks')
569
570
571 # Shortcut to search apps.
572 def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
573 start=0, stop=None, domains=None, pause=2.0,
574 only_standard=False, extra_params={}):
575 """
576 Shortcut to search apps.
577
578 :param str query: Query string. Must NOT be url-encoded.
579 :param str tld: Top level domain.
580 :param str lang: Language.
581 :param str tbs: Time limits (i.e "qdr:h" => last hour,
582 "qdr:d" => last 24 hours, "qdr:m" => last month).
583 :param str safe: Safe search.
584 :param int num: Number of results per page.
585 :param int start: First result to retrieve.
586 :param int or None stop: Last result to retrieve.
587 Use None to keep searching forever.
588 :param list of str or None domains: A list of web domains to constrain
589 the search.
590 :param float pause: Lapse to wait between HTTP requests.
591 A lapse too long will make the search slow, but a lapse too short may
592 cause Google to block your IP. Your mileage may vary!
593 :param bool only_standard: If True, only returns the standard results from
594 each page. If False, it returns every possible link from each page,
595 except for those that point back to Google itself. Defaults to False
596 for backwards compatibility with older versions of this module.
597 :param dict of str to str extra_params: A dictionary of extra HTTP GET
598 parameters, which must be URL encoded. For example if you don't want
599 Google to filter similar results you can set the extra_params to
600 {'filter': '0'} which will append '&filter=0' to every query.
601 :param str tpe: Search type (images, videos, news, shopping, books, apps)
602 Use the following values {videos: 'vid', images: 'isch',
603 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
604 :param str or None user_agent: User agent for the HTTP requests.
605 Use None for the default.
606
607 :rtype: generator of str
608 :return: Generator (iterator) that yields found URLs.
609 If the stop parameter is None the iterator will loop forever.
610 """
611 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
612 pause, only_standard, extra_params, tpe='app')
613
614
615364 # Shortcut to single-item search.
616365 # Evaluates the iterator to return the single URL as a string.
617 def lucky(query, tld='com', lang='en', tbs='0', safe='off',
618 only_standard=False, extra_params={}, tpe=''):
366 def lucky(*args, **kwargs):
619367 """
620368 Shortcut to single-item search.
621369
622 :param str query: Query string. Must NOT be url-encoded.
623 :param str tld: Top level domain.
624 :param str lang: Language.
625 :param str tbs: Time limits (i.e "qdr:h" => last hour,
626 "qdr:d" => last 24 hours, "qdr:m" => last month).
627 :param str safe: Safe search.
628 :param int num: Number of results per page.
629 :param int start: First result to retrieve.
630 :param int or None stop: Last result to retrieve.
631 Use None to keep searching forever.
632 :param list of str or None domains: A list of web domains to constrain
633 the search.
634 :param float pause: Lapse to wait between HTTP requests.
635 A lapse too long will make the search slow, but a lapse too short may
636 cause Google to block your IP. Your mileage may vary!
637 :param bool only_standard: If True, only returns the standard results from
638 each page. If False, it returns every possible link from each page,
639 except for those that point back to Google itself. Defaults to False
640 for backwards compatibility with older versions of this module.
641 :param dict of str to str extra_params: A dictionary of extra HTTP GET
642 parameters, which must be URL encoded. For example if you don't want
643 Google to filter similar results you can set the extra_params to
644 {'filter': '0'} which will append '&filter=0' to every query.
645 :param str tpe: Search type (images, videos, news, shopping, books, apps)
646 Use the following values {videos: 'vid', images: 'isch',
647 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
648 :param str or None user_agent: User agent for the HTTP requests.
649 Use None for the default.
370 Accepts the same arguments as the main search function, but returns only the first URL found instead of a generator.
650371
651372 :rtype: str
652373 :return: URL found by Google.
653374 """
654 gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
655 extra_params, tpe)
656 return next(gen)
657
658
659 # Returns only the number of Google hits for the given search query.
660 # This is the number reported by Google itself, NOT by scraping.
661 def hits(query, tld='com', lang='en', tbs='0', safe='off',
662 domains=None, extra_params={}, tpe='', user_agent=None):
663 """
664 Search the given query string using Google and return the number of hits.
665
666 :note: This is the number reported by Google itself, NOT by scraping.
667
668 :param str query: Query string. Must NOT be url-encoded.
669 :param str tld: Top level domain.
670 :param str lang: Language.
671 :param str tbs: Time limits (i.e "qdr:h" => last hour,
672 "qdr:d" => last 24 hours, "qdr:m" => last month).
673 :param str safe: Safe search.
674 :param int num: Number of results per page.
675 :param int start: First result to retrieve.
676 :param int or None stop: Last result to retrieve.
677 Use None to keep searching forever.
678 :param list of str or None domains: A list of web domains to constrain
679 the search.
680 :param float pause: Lapse to wait between HTTP requests.
681 A lapse too long will make the search slow, but a lapse too short may
682 cause Google to block your IP. Your mileage may vary!
683 :param bool only_standard: If True, only returns the standard results from
684 each page. If False, it returns every possible link from each page,
685 except for those that point back to Google itself. Defaults to False
686 for backwards compatibility with older versions of this module.
687 :param dict of str to str extra_params: A dictionary of extra HTTP GET
688 parameters, which must be URL encoded. For example if you don't want
689 Google to filter similar results you can set the extra_params to
690 {'filter': '0'} which will append '&filter=0' to every query.
691 :param str tpe: Search type (images, videos, news, shopping, books, apps)
692 Use the following values {videos: 'vid', images: 'isch',
693 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
694 :param str or None user_agent: User agent for the HTTP requests.
695 Use None for the default.
696
697 :rtype: int
698 :return: Number of Google hits for the given search query.
699 """
700
701 # Prepare domain list if it exists.
702 if domains:
703 domain_query = '+OR+'.join('site:' + domain for domain in domains)
704 domain_query = '+' + domain_query
705 else:
706 domain_query = ''
707
708 # Prepare the search string.
709 query = quote_plus(query + domain_query)
710
711 # Check extra_params for overlapping
712 for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
713 if builtin_param in extra_params.keys():
714 raise ValueError(
715 'GET parameter "%s" is overlapping with \
716 the built-in GET parameter',
717 builtin_param
718 )
719
720 # Grab the cookie from the home page.
721 get_page(url_home % vars(), user_agent)
722
723 # Prepare the URL of the first (and in this case ONLY) request.
724 url = url_search % vars()
725
726 try: # Is it python<3?
727 iter_extra_params = extra_params.iteritems()
728 except AttributeError: # Or python>3?
729 iter_extra_params = extra_params.items()
730 # Append extra GET_parameters to URL
731 for k, v in iter_extra_params:
732 url += url + ('&%s=%s' % (k, v))
733
734 # Request the Google Search results page.
735 html = get_page(url, user_agent)
736
737 # Parse the response.
738 if is_bs4:
739 soup = BeautifulSoup(html, 'html.parser')
740 else:
741 soup = BeautifulSoup(html)
742
743 # Get the number of hits.
744 tag = soup.find_all(attrs={"class": "sd", "id": "resultStats"})[0]
745 hits_text_parts = tag.text.split()
746 if len(hits_text_parts) < 3:
747 return 0
748 return int(hits_text_parts[1].replace(',', '').replace('.', ''))
749
750
751 def ngd(term1, term2):
752 """
753 Return the Normalized Google distance between words.
754
755 For more info, refer to:
756 https://en.wikipedia.org/wiki/Normalized_Google_distance
757
758 :param str term1: First term to compare.
759 :param str term2: Second term to compare.
760
761 :rtype: float
762 :return: Normalized Google distance between words.
763 """
764
765 lhits1 = math.log10(hits(term1))
766 lhits2 = math.log10(hits(term2))
767 lhits_mix = math.log10(hits('"' + term1 + '" "' + term2 + '"'))
768 npages = hits('the')
769 fix = 1000
770
771 lN = math.log10(npages * fix)
772 numerator = max([lhits1, lhits2]) - lhits_mix
773 denomin = lN - min([lhits1, lhits2])
774
775 return numerator / denomin
375 return next(search(*args, **kwargs))
00 #!/usr/bin/env python
11
2 # Python bindings to the Google search engine
3 # Copyright (c) 2009-2016, Mario Vilas
2 # Copyright (c) 2009-2020, Mario Vilas
43 # All rights reserved.
54 #
65 # Redistribution and use in source and binary forms, with or without
3130
3231 from googlesearch import search, get_random_user_agent
3332
33 # TODO port to argparse
3434 from optparse import OptionParser, IndentedHelpFormatter
35
3536
3637 class BannerHelpFormatter(IndentedHelpFormatter):
3738
4546 msg = IndentedHelpFormatter.format_usage(self, usage)
4647 return '%s\n%s' % (self.banner, msg)
4748
48 # Parse the command line arguments.
49 formatter = BannerHelpFormatter(
50 "Python script to use the Google search engine\n"
51 "By Mario Vilas (mvilas at gmail dot com)\n"
52 "https://github.com/MarioVilas/googlesearch\n"
53 )
54 parser = OptionParser(formatter=formatter)
55 parser.set_usage("%prog [options] query")
56 parser.add_option("--tld", metavar="TLD", type="string", default="com",
57 help="top level domain to use [default: com]")
58 parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
59 help="produce results in the given language [default: en]")
60 parser.add_option("--domains", metavar="DOMAINS", type="string", default="",
61 help="comma separated list of domains to constrain the search to")
62 parser.add_option("--tbs", metavar="TBS", type="string", default="0",
63 help="produce results from period [default: 0]")
64 parser.add_option("--safe", metavar="SAFE", type="string", default="off",
65 help="kids safe search [default: off]")
66 parser.add_option("--num", metavar="NUMBER", type="int", default=10,
67 help="number of results per page [default: 10]")
68 parser.add_option("--start", metavar="NUMBER", type="int", default=0,
69 help="first result to retrieve [default: 0]")
70 parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
71 help="last result to retrieve [default: unlimited]")
72 parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
73 help="pause between HTTP requests [default: 2.0]")
74 parser.add_option("--rua", metavar="USERAGENT", action="store_true", default=False,
75 help="Randomize the User-Agent [default: no]")
76 parser.add_option("--all", dest="only_standard",
77 action="store_false", default=True,
78 help="grab all possible links from result pages [default: only standard results]")
79 (options, args) = parser.parse_args()
80 query = ' '.join(args)
81 if not query:
82 parser.print_help()
83 sys.exit(2)
84 params = [(k, v) for (k, v) in options.__dict__.items() if not k.startswith('_')]
85 params = dict(params)
8649
87 # Split the comma separated list of domains, if present.
88 if 'domains' in params:
89 params['domains'] = [x.strip() for x in params['domains'].split(',')]
50 def main():
9051
91 # Randomize the user agent if requested.
92 if "rua" in params:
93 rua = params.pop("rua")
94 if rua:
95 params["user_agent"] = get_random_user_agent()
52 # Parse the command line arguments.
53 formatter = BannerHelpFormatter(
54 "Python script to use the Google search engine\n"
55 "By Mario Vilas (mvilas at gmail dot com)\n"
56 "https://github.com/MarioVilas/googlesearch\n"
57 )
58 parser = OptionParser(formatter=formatter)
59 parser.set_usage("%prog [options] query")
60 parser.add_option(
61 '--tld', metavar='TLD', type='string', default='com',
62 help="top level domain to use [default: com]")
63 parser.add_option(
64 '--lang', metavar='LANGUAGE', type='string', default='en',
65 help="produce results in the given language [default: en]")
66 parser.add_option(
67 '--tbs', metavar='TBS', type='string', default='0',
68 help="produce results from period [default: 0]")
69 parser.add_option(
70 '--safe', metavar='SAFE', type='string', default='off',
71 help="kids safe search [default: off]")
72 parser.add_option(
73 '--country', metavar='COUNTRY', type='string', default='',
74 help="region to restrict search on [default: not restricted]")
75 parser.add_option(
76 '--num', metavar='NUMBER', type='int', default=10,
77 help="number of results per page [default: 10]")
78 parser.add_option(
79 '--start', metavar='NUMBER', type='int', default=0,
80 help="first result to retrieve [default: 0]")
81 parser.add_option(
82 '--stop', metavar='NUMBER', type='int', default=0,
83 help="last result to retrieve [default: unlimited]")
84 parser.add_option(
85 '--pause', metavar='SECONDS', type='float', default=2.0,
86 help="pause between HTTP requests [default: 2.0]")
87 parser.add_option(
88 '--rua', action='store_true', default=False,
89 help="Randomize the User-Agent [default: no]")
90 parser.add_option(
91 '--insecure', dest="verify_ssl", action='store_false', default=True,
92 help="Randomize the User-Agent [default: no]")
93 (options, args) = parser.parse_args()
94 query = ' '.join(args)
95 if not query:
96 parser.print_help()
97 sys.exit(2)
98 params = [
99 (k, v) for (k, v) in options.__dict__.items()
100 if not k.startswith('_')]
101 params = dict(params)
96102
97 # Run the query.
98 for url in search(query, **params):
99 print(url)
100 try:
101 sys.stdout.flush()
102 except:
103 pass
103 # Randomize the user agent if requested.
104 if 'rua' in params and params.pop('rua'):
105 params['user_agent'] = get_random_user_agent()
106
107 # Run the query.
108 for url in search(query, **params):
109 print(url)
110 try:
111 sys.stdout.flush()
112 except Exception:
113 pass
114
115
116 if __name__ == '__main__':
117 main()
00 #!/usr/bin/env python
11
2 # Copyright (c) 2009-2019, Mario Vilas
2 # Copyright (c) 2009-2020, Mario Vilas
33 # All rights reserved.
44 #
55 # Redistribution and use in source and binary forms, with or without
4343 scripts=[join('scripts', 'google')],
4444 package_data={'googlesearch': ['user_agents.txt.gz']},
4545 include_package_data=True,
46 version="2.0.2",
46 version="3.0.0",
4747 description="Python bindings to the Google search engine.",
4848 author="Mario Vilas",
4949 author_email="[email protected]",
6666 except ImportError:
6767 from distutils.core import setup
6868
69 # Get the long description from the readme file.
70 try:
71 metadata['long_description'] = open(join(here, 'README.md'), 'rU').read()
72 except Exception:
73 pass
74
7569 # Run the setup script.
7670 setup(**metadata)