Commit c9fea811-ed52-471e-b681-9f6bdd469bed/upstream - python-googlesearch

Import upstream version 3.0.0 Kali Janitor 3 years ago

5 changed file(s) with 168 addition(s) and 598 deletion(s). Raw diff Collapse all Expand all

-21

PKG-INFO less more

0	0	Metadata-Version: 1.1
1	1	Name: google
2		Version: 2.0.2
	2	Version: 3.0.0
3	3	Summary: Python bindings to the Google search engine.
4	4	Home-page: http://breakingcode.wordpress.com/
5	5	Author: Mario Vilas
6	6	Author-email: [email protected]
7	7	License: UNKNOWN
8		Description: googlesearch
9		============
10
11		Google search from Python.
12
13		https://python-googlesearch.readthedocs.io/en/latest/
14
15		Usage example
16		-------------
17
18		# Get the first 20 hits for: "Breaking Code" WordPress blog
19		from googlesearch import search
20		for url in search('"Breaking Code" WordPress blog', stop=20):
21		print(url)
22
23		Installing
24		----------
25
26		pip install google
27
	8	Description: UNKNOWN
28	9	Platform: UNKNOWN
29	10	Classifier: Development Status :: 5 - Production/Stable
30	11	Classifier: Intended Audience :: Developers

-21

google.egg-info/PKG-INFO less more

0	0	Metadata-Version: 1.1
1	1	Name: google
2		Version: 2.0.2
	2	Version: 3.0.0
3	3	Summary: Python bindings to the Google search engine.
4	4	Home-page: http://breakingcode.wordpress.com/
5	5	Author: Mario Vilas
6	6	Author-email: [email protected]
7	7	License: UNKNOWN
8		Description: googlesearch
9		============
10
11		Google search from Python.
12
13		https://python-googlesearch.readthedocs.io/en/latest/
14
15		Usage example
16		-------------
17
18		# Get the first 20 hits for: "Breaking Code" WordPress blog
19		from googlesearch import search
20		for url in search('"Breaking Code" WordPress blog', stop=20):
21		print(url)
22
23		Installing
24		----------
25
26		pip install google
27
	8	Description: UNKNOWN
28	9	Platform: UNKNOWN
29	10	Classifier: Development Status :: 5 - Production/Stable
30	11	Classifier: Intended Audience :: Developers

+93

-493

googlesearch/__init__.py less more

0	0	#!/usr/bin/env python
1	1
2		# Python bindings to the Google search engine
3		# Copyright (c) 2009-2018, Mario Vilas
	2	# Copyright (c) 2009-2020, Mario Vilas
4	3	# All rights reserved.
5	4	#
6	5	# Redistribution and use in source and binary forms, with or without

31	30	import random
32	31	import sys
33	32	import time
34		import math
	33	import ssl
35	34
36	35	if sys.version_info[0] > 2:
37	36	from http.cookiejar import LWPCookieJar

55	54	# Main search function.
56	55	'search',
57	56
58		# Specialized search functions.
59		'search_images', 'search_news',
60		'search_videos', 'search_shop',
61		'search_books', 'search_apps',
62
63	57	# Shortcut for "get lucky" search.
64	58	'lucky',
65	59
66		# Computations based on the number of Google hits.
67		'hits', 'ngd',
68
69	60	# Miscellaneous utility functions.
70		'get_random_user_agent',
	61	'get_random_user_agent', 'get_tbs',
71	62	]
72	63
73	64	# URL templates to make Google searches.
74	65	url_home = "https://www.google.%(tld)s/"
75	66	url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
76		"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
	67	"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
	68	"cr=%(country)s"
77	69	url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
78		"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
	70	"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
	71	"cr=%(country)s"
79	72	url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
80	73	"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
81		"tbm=%(tpe)s"
	74	"cr=%(country)s"
82	75	url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
83	76	"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
84		"safe=%(safe)s&tbm=%(tpe)s"
	77	"safe=%(safe)s&cr=%(country)s"
	78	url_parameters = (
	79	'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')
85	80
86	81	# Cookie jar. Stored at the user's home folder.
	82	# If the cookie jar is inaccessible, the errors are ignored.
87	83	home_folder = os.getenv('HOME')
88	84	if not home_folder:
89	85	home_folder = os.getenv('USERHOME')

99	95	USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
100	96
101	97	# Load the list of valid user agents from the install folder.
	98	# The search order is:
	99	# * user_agents.txt.gz
	100	# * user_agents.txt
	101	# * default user agent
102	102	try:
103	103	install_folder = os.path.abspath(os.path.split(__file__)[0])
104	104	try:

129	129	return random.choice(user_agents_list)
130	130
131	131
	132	# Helper function to format the tbs parameter.
	133	def get_tbs(from_date, to_date):
	134	"""
	135	Helper function to format the tbs parameter.
	136
	137	:param datetime.date from_date: Python date object.
	138	:param datetime.date to_date: Python date object.
	139
	140	:rtype: str
	141	:return: Dates encoded in tbs format.
	142	"""
	143	from_date = from_date.strftime('%m/%d/%Y')
	144	to_date = to_date.strftime('%m/%d/%Y')
	145	return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
	146
	147
132	148	# Request the given URL and return the response page, using the cookie jar.
133		def get_page(url, user_agent=None):
	149	# If the cookie jar is inaccessible, the errors are ignored.
	150	def get_page(url, user_agent=None, verify_ssl=True):
134	151	"""
135	152	Request the given URL and return the response page, using the cookie jar.
136	153
137	154	:param str url: URL to retrieve.
138	155	:param str user_agent: User agent for the HTTP requests.
139	156	Use None for the default.
	157	:param bool verify_ssl: Verify the SSL certificate to prevent
	158	traffic interception attacks. Defaults to True.
140	159
141	160	:rtype: str
142	161	:return: Web page retrieved for the given URL.

150	169	request = Request(url)
151	170	request.add_header('User-Agent', user_agent)
152	171	cookie_jar.add_cookie_header(request)
153		response = urlopen(request)
	172	if verify_ssl:
	173	response = urlopen(request)
	174	else:
	175	context = ssl._create_unverified_context()
	176	response = urlopen(request, context=context)
154	177	cookie_jar.extract_cookies(response, request)
155	178	html = response.read()
156	179	response.close()

166	189	def filter_result(link):
167	190	try:
168	191
169		# Valid results are absolute URLs not pointing to a Google domain
170		# like images.google.com or googleusercontent.com
	192	# Decode hidden URLs.
	193	if link.startswith('/url?'):
	194	o = urlparse(link, 'http')
	195	link = parse_qs(o.query)['q'][0]
	196
	197	# Valid results are absolute URLs not pointing to a Google domain,
	198	# like images.google.com or googleusercontent.com for example.
	199	# TODO this could be improved!
171	200	o = urlparse(link, 'http')
172	201	if o.netloc and 'google' not in o.netloc:
173	202	return link
174	203
175		# Decode hidden URLs.
176		if link.startswith('/url?'):
177		link = parse_qs(o.query)['q'][0]
178
179		# Valid results are absolute URLs not pointing to a Google domain
180		# like images.google.com or googleusercontent.com
181		o = urlparse(link, 'http')
182		if o.netloc and 'google' not in o.netloc:
183		return link
184
185		# Otherwise, or on error, return None.
	204	# On error, return None.
186	205	except Exception:
187	206	pass
188		return None
189	207
190	208
191	209	# Returns a generator that yields URLs.
192	210	def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
193		stop=None, domains=None, pause=2.0, only_standard=False,
194		extra_params={}, tpe='', user_agent=None):
	211	stop=None, pause=2.0, country='', extra_params=None,
	212	user_agent=None, verify_ssl=True):
195	213	"""
196	214	Search the given query string using Google.
197	215

203	221	:param str safe: Safe search.
204	222	:param int num: Number of results per page.
205	223	:param int start: First result to retrieve.
206		:param int or None stop: Last result to retrieve.
	224	:param int stop: Last result to retrieve.
207	225	Use None to keep searching forever.
208		:param list of str or None domains: A list of web domains to constrain
209		the search.
210	226	:param float pause: Lapse to wait between HTTP requests.
211	227	A lapse too long will make the search slow, but a lapse too short may
212	228	cause Google to block your IP. Your mileage may vary!
213		:param bool only_standard: If True, only returns the standard results from
214		each page. If False, it returns every possible link from each page,
215		except for those that point back to Google itself. Defaults to False
216		for backwards compatibility with older versions of this module.
217		:param dict of str to str extra_params: A dictionary of extra HTTP GET
	229	:param str country: Country or region to focus the search on. Similar to
	230	changing the TLD, but does not yield exactly the same results.
	231	Only Google knows why...
	232	:param dict extra_params: A dictionary of extra HTTP GET
218	233	parameters, which must be URL encoded. For example if you don't want
219	234	Google to filter similar results you can set the extra_params to
220	235	{'filter': '0'} which will append '&filter=0' to every query.
221		:param str tpe: Search type (images, videos, news, shopping, books, apps)
222		Use the following values {videos: 'vid', images: 'isch',
223		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
224		:param str or None user_agent: User agent for the HTTP requests.
	236	:param str user_agent: User agent for the HTTP requests.
225	237	Use None for the default.
	238	:param bool verify_ssl: Verify the SSL certificate to prevent
	239	traffic interception attacks. Defaults to True.
226	240
227	241	:rtype: generator of str
228	242	:return: Generator (iterator) that yields found URLs.

232	246	# This is used to avoid repeated results.
233	247	hashes = set()
234	248
235		# Count the number of links yielded
	249	# Count the number of links yielded.
236	250	count = 0
237
238		# Prepare domain list if it exists.
239		if domains:
240		query = query + ' ' + ' OR '.join(
241		'site:' + domain for domain in domains)
242	251
243	252	# Prepare the search string.
244	253	query = quote_plus(query)
245	254
246		# Check extra_params for overlapping
247		for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
	255	# If no extra_params is given, create an empty dictionary.
	256	# We should avoid using an empty dictionary as a default value
	257	# in a function parameter in Python.
	258	if not extra_params:
	259	extra_params = {}
	260
	261	# Check extra_params for overlapping.
	262	for builtin_param in url_parameters:
248	263	if builtin_param in extra_params.keys():
249	264	raise ValueError(
250	265	'GET parameter "%s" is overlapping with \

253	268	)
254	269
255	270	# Grab the cookie from the home page.
256		get_page(url_home % vars(), user_agent)
	271	get_page(url_home % vars(), user_agent, verify_ssl)
257	272
258	273	# Prepare the URL of the first request.
259	274	if start:

269	284
270	285	# Loop until we reach the maximum result, if any (otherwise, loop forever).
271	286	while not stop or count < stop:
272		# Remeber last count to detect the end of results
	287
	288	# Remeber last count to detect the end of results.
273	289	last_count = count
274	290
275		try: # Is it python<3?
276		iter_extra_params = extra_params.iteritems()
277		except AttributeError: # Or python>3?
278		iter_extra_params = extra_params.items()
279		# Append extra GET_parameters to URL
280		for k, v in iter_extra_params:
281		url += url + ('&%s=%s' % (k, v))
	291	# Append extra GET parameters to the URL.
	292	# This is done on every iteration because we're
	293	# rebuilding the entire URL at the end of this loop.
	294	for k, v in extra_params.items():
	295	k = quote_plus(k)
	296	v = quote_plus(v)
	297	url = url + ('&%s=%s' % (k, v))
282	298
283	299	# Sleep between requests.
	300	# Keeps Google from banning you for making too many requests.
284	301	time.sleep(pause)
285	302
286	303	# Request the Google Search results page.
287		html = get_page(url, user_agent)
288
289		# Parse the response and process every anchored URL.
	304	html = get_page(url, user_agent, verify_ssl)
	305
	306	# Parse the response and get every anchored URL.
290	307	if is_bs4:
291	308	soup = BeautifulSoup(html, 'html.parser')
292	309	else:

294	311	try:
295	312	anchors = soup.find(id='search').findAll('a')
296	313	# Sometimes (depending on the User-agent) there is
297		# no id "search" in html response
	314	# no id "search" in html response...
298	315	except AttributeError:
299		# Remove links of the top bar
	316	# Remove links of the top bar.
300	317	gbar = soup.find(id='gbar')
301	318	if gbar:
302	319	gbar.clear()
303	320	anchors = soup.findAll('a')
	321
	322	# Process every anchored URL.
304	323	for a in anchors:
305
306		# Leave only the "standard" results if requested.
307		# Otherwise grab all possible links.
308		if only_standard and (
309		not a.parent or a.parent.name.lower() != "h3"):
310		continue
311	324
312	325	# Get the URL from the anchor tag.
313	326	try:

329	342	# Yield the result.
330	343	yield link
331	344
	345	# Increase the results counter.
	346	# If we reached the limit, stop.
332	347	count += 1
333	348	if stop and count >= stop:
334	349	return
335	350
336	351	# End if there are no more results.
	352	# XXX TODO review this logic, not sure if this is still true!
337	353	if last_count == count:
338	354	break
339	355

345	361	url = url_next_page_num % vars()
346	362
347	363
348		# Shortcut to search images.
349		# Beware, this does not return the image link.
350		def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
351		start=0, stop=None, pause=2.0, domains=None,
352		only_standard=False, extra_params={}):
353		"""
354		Shortcut to search images.
355
356		:note: Beware, this does not return the image link.
357
358		:param str query: Query string. Must NOT be url-encoded.
359		:param str tld: Top level domain.
360		:param str lang: Language.
361		:param str tbs: Time limits (i.e "qdr:h" => last hour,
362		"qdr:d" => last 24 hours, "qdr:m" => last month).
363		:param str safe: Safe search.
364		:param int num: Number of results per page.
365		:param int start: First result to retrieve.
366		:param int or None stop: Last result to retrieve.
367		Use None to keep searching forever.
368		:param list of str or None domains: A list of web domains to constrain
369		the search.
370		:param float pause: Lapse to wait between HTTP requests.
371		A lapse too long will make the search slow, but a lapse too short may
372		cause Google to block your IP. Your mileage may vary!
373		:param bool only_standard: If True, only returns the standard results from
374		each page. If False, it returns every possible link from each page,
375		except for those that point back to Google itself. Defaults to False
376		for backwards compatibility with older versions of this module.
377		:param dict of str to str extra_params: A dictionary of extra HTTP GET
378		parameters, which must be URL encoded. For example if you don't want
379		Google to filter similar results you can set the extra_params to
380		{'filter': '0'} which will append '&filter=0' to every query.
381		:param str tpe: Search type (images, videos, news, shopping, books, apps)
382		Use the following values {videos: 'vid', images: 'isch',
383		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
384		:param str or None user_agent: User agent for the HTTP requests.
385		Use None for the default.
386
387		:rtype: generator of str
388		:return: Generator (iterator) that yields found URLs.
389		If the stop parameter is None the iterator will loop forever.
390		"""
391		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
392		pause, only_standard, extra_params, tpe='isch')
393
394
395		# Shortcut to search news.
396		def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
397		start=0, stop=None, domains=None, pause=2.0,
398		only_standard=False, extra_params={}):
399		"""
400		Shortcut to search news.
401
402		:param str query: Query string. Must NOT be url-encoded.
403		:param str tld: Top level domain.
404		:param str lang: Language.
405		:param str tbs: Time limits (i.e "qdr:h" => last hour,
406		"qdr:d" => last 24 hours, "qdr:m" => last month).
407		:param str safe: Safe search.
408		:param int num: Number of results per page.
409		:param int start: First result to retrieve.
410		:param int or None stop: Last result to retrieve.
411		Use None to keep searching forever.
412		:param list of str or None domains: A list of web domains to constrain
413		the search.
414		:param float pause: Lapse to wait between HTTP requests.
415		A lapse too long will make the search slow, but a lapse too short may
416		cause Google to block your IP. Your mileage may vary!
417		:param bool only_standard: If True, only returns the standard results from
418		each page. If False, it returns every possible link from each page,
419		except for those that point back to Google itself. Defaults to False
420		for backwards compatibility with older versions of this module.
421		:param dict of str to str extra_params: A dictionary of extra HTTP GET
422		parameters, which must be URL encoded. For example if you don't want
423		Google to filter similar results you can set the extra_params to
424		{'filter': '0'} which will append '&filter=0' to every query.
425		:param str tpe: Search type (images, videos, news, shopping, books, apps)
426		Use the following values {videos: 'vid', images: 'isch',
427		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
428		:param str or None user_agent: User agent for the HTTP requests.
429		Use None for the default.
430
431		:rtype: generator of str
432		:return: Generator (iterator) that yields found URLs.
433		If the stop parameter is None the iterator will loop forever.
434		"""
435		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
436		pause, only_standard, extra_params, tpe='nws')
437
438
439		# Shortcut to search videos.
440		def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
441		start=0, stop=None, domains=None, pause=2.0,
442		only_standard=False, extra_params={}):
443		"""
444		Shortcut to search videos.
445
446		:param str query: Query string. Must NOT be url-encoded.
447		:param str tld: Top level domain.
448		:param str lang: Language.
449		:param str tbs: Time limits (i.e "qdr:h" => last hour,
450		"qdr:d" => last 24 hours, "qdr:m" => last month).
451		:param str safe: Safe search.
452		:param int num: Number of results per page.
453		:param int start: First result to retrieve.
454		:param int or None stop: Last result to retrieve.
455		Use None to keep searching forever.
456		:param list of str or None domains: A list of web domains to constrain
457		the search.
458		:param float pause: Lapse to wait between HTTP requests.
459		A lapse too long will make the search slow, but a lapse too short may
460		cause Google to block your IP. Your mileage may vary!
461		:param bool only_standard: If True, only returns the standard results from
462		each page. If False, it returns every possible link from each page,
463		except for those that point back to Google itself. Defaults to False
464		for backwards compatibility with older versions of this module.
465		:param dict of str to str extra_params: A dictionary of extra HTTP GET
466		parameters, which must be URL encoded. For example if you don't want
467		Google to filter similar results you can set the extra_params to
468		{'filter': '0'} which will append '&filter=0' to every query.
469		:param str tpe: Search type (images, videos, news, shopping, books, apps)
470		Use the following values {videos: 'vid', images: 'isch',
471		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
472		:param str or None user_agent: User agent for the HTTP requests.
473		Use None for the default.
474
475		:rtype: generator of str
476		:return: Generator (iterator) that yields found URLs.
477		If the stop parameter is None the iterator will loop forever.
478		"""
479		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
480		pause, only_standard, extra_params, tpe='vid')
481
482
483		# Shortcut to search shop.
484		def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
485		start=0, stop=None, domains=None, pause=2.0,
486		only_standard=False, extra_params={}):
487		"""
488		Shortcut to search shop.
489
490		:param str query: Query string. Must NOT be url-encoded.
491		:param str tld: Top level domain.
492		:param str lang: Language.
493		:param str tbs: Time limits (i.e "qdr:h" => last hour,
494		"qdr:d" => last 24 hours, "qdr:m" => last month).
495		:param str safe: Safe search.
496		:param int num: Number of results per page.
497		:param int start: First result to retrieve.
498		:param int or None stop: Last result to retrieve.
499		Use None to keep searching forever.
500		:param list of str or None domains: A list of web domains to constrain
501		the search.
502		:param float pause: Lapse to wait between HTTP requests.
503		A lapse too long will make the search slow, but a lapse too short may
504		cause Google to block your IP. Your mileage may vary!
505		:param bool only_standard: If True, only returns the standard results from
506		each page. If False, it returns every possible link from each page,
507		except for those that point back to Google itself. Defaults to False
508		for backwards compatibility with older versions of this module.
509		:param dict of str to str extra_params: A dictionary of extra HTTP GET
510		parameters, which must be URL encoded. For example if you don't want
511		Google to filter similar results you can set the extra_params to
512		{'filter': '0'} which will append '&filter=0' to every query.
513		:param str tpe: Search type (images, videos, news, shopping, books, apps)
514		Use the following values {videos: 'vid', images: 'isch',
515		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
516		:param str or None user_agent: User agent for the HTTP requests.
517		Use None for the default.
518
519		:rtype: generator of str
520		:return: Generator (iterator) that yields found URLs.
521		If the stop parameter is None the iterator will loop forever.
522		"""
523		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
524		pause, only_standard, extra_params, tpe='shop')
525
526
527		# Shortcut to search books.
528		def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
529		start=0, stop=None, domains=None, pause=2.0,
530		only_standard=False, extra_params={}):
531		"""
532		Shortcut to search books.
533
534		:param str query: Query string. Must NOT be url-encoded.
535		:param str tld: Top level domain.
536		:param str lang: Language.
537		:param str tbs: Time limits (i.e "qdr:h" => last hour,
538		"qdr:d" => last 24 hours, "qdr:m" => last month).
539		:param str safe: Safe search.
540		:param int num: Number of results per page.
541		:param int start: First result to retrieve.
542		:param int or None stop: Last result to retrieve.
543		Use None to keep searching forever.
544		:param list of str or None domains: A list of web domains to constrain
545		the search.
546		:param float pause: Lapse to wait between HTTP requests.
547		A lapse too long will make the search slow, but a lapse too short may
548		cause Google to block your IP. Your mileage may vary!
549		:param bool only_standard: If True, only returns the standard results from
550		each page. If False, it returns every possible link from each page,
551		except for those that point back to Google itself. Defaults to False
552		for backwards compatibility with older versions of this module.
553		:param dict of str to str extra_params: A dictionary of extra HTTP GET
554		parameters, which must be URL encoded. For example if you don't want
555		Google to filter similar results you can set the extra_params to
556		{'filter': '0'} which will append '&filter=0' to every query.
557		:param str tpe: Search type (images, videos, news, shopping, books, apps)
558		Use the following values {videos: 'vid', images: 'isch',
559		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
560		:param str or None user_agent: User agent for the HTTP requests.
561		Use None for the default.
562
563		:rtype: generator of str
564		:return: Generator (iterator) that yields found URLs.
565		If the stop parameter is None the iterator will loop forever.
566		"""
567		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
568		pause, only_standard, extra_params, tpe='bks')
569
570
571		# Shortcut to search apps.
572		def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
573		start=0, stop=None, domains=None, pause=2.0,
574		only_standard=False, extra_params={}):
575		"""
576		Shortcut to search apps.
577
578		:param str query: Query string. Must NOT be url-encoded.
579		:param str tld: Top level domain.
580		:param str lang: Language.
581		:param str tbs: Time limits (i.e "qdr:h" => last hour,
582		"qdr:d" => last 24 hours, "qdr:m" => last month).
583		:param str safe: Safe search.
584		:param int num: Number of results per page.
585		:param int start: First result to retrieve.
586		:param int or None stop: Last result to retrieve.
587		Use None to keep searching forever.
588		:param list of str or None domains: A list of web domains to constrain
589		the search.
590		:param float pause: Lapse to wait between HTTP requests.
591		A lapse too long will make the search slow, but a lapse too short may
592		cause Google to block your IP. Your mileage may vary!
593		:param bool only_standard: If True, only returns the standard results from
594		each page. If False, it returns every possible link from each page,
595		except for those that point back to Google itself. Defaults to False
596		for backwards compatibility with older versions of this module.
597		:param dict of str to str extra_params: A dictionary of extra HTTP GET
598		parameters, which must be URL encoded. For example if you don't want
599		Google to filter similar results you can set the extra_params to
600		{'filter': '0'} which will append '&filter=0' to every query.
601		:param str tpe: Search type (images, videos, news, shopping, books, apps)
602		Use the following values {videos: 'vid', images: 'isch',
603		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
604		:param str or None user_agent: User agent for the HTTP requests.
605		Use None for the default.
606
607		:rtype: generator of str
608		:return: Generator (iterator) that yields found URLs.
609		If the stop parameter is None the iterator will loop forever.
610		"""
611		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
612		pause, only_standard, extra_params, tpe='app')
613
614
615	364	# Shortcut to single-item search.
616	365	# Evaluates the iterator to return the single URL as a string.
617		def lucky(query, tld='com', lang='en', tbs='0', safe='off',
618		only_standard=False, extra_params={}, tpe=''):
	366	def lucky(*args, **kwargs):
619	367	"""
620	368	Shortcut to single-item search.
621	369
622		:param str query: Query string. Must NOT be url-encoded.
623		:param str tld: Top level domain.
624		:param str lang: Language.
625		:param str tbs: Time limits (i.e "qdr:h" => last hour,
626		"qdr:d" => last 24 hours, "qdr:m" => last month).
627		:param str safe: Safe search.
628		:param int num: Number of results per page.
629		:param int start: First result to retrieve.
630		:param int or None stop: Last result to retrieve.
631		Use None to keep searching forever.
632		:param list of str or None domains: A list of web domains to constrain
633		the search.
634		:param float pause: Lapse to wait between HTTP requests.
635		A lapse too long will make the search slow, but a lapse too short may
636		cause Google to block your IP. Your mileage may vary!
637		:param bool only_standard: If True, only returns the standard results from
638		each page. If False, it returns every possible link from each page,
639		except for those that point back to Google itself. Defaults to False
640		for backwards compatibility with older versions of this module.
641		:param dict of str to str extra_params: A dictionary of extra HTTP GET
642		parameters, which must be URL encoded. For example if you don't want
643		Google to filter similar results you can set the extra_params to
644		{'filter': '0'} which will append '&filter=0' to every query.
645		:param str tpe: Search type (images, videos, news, shopping, books, apps)
646		Use the following values {videos: 'vid', images: 'isch',
647		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
648		:param str or None user_agent: User agent for the HTTP requests.
649		Use None for the default.
	370	Same arguments as the main search function, but the return value changes.
650	371
651	372	:rtype: str
652	373	:return: URL found by Google.
653	374	"""
654		gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
655		extra_params, tpe)
656		return next(gen)
657
658
659		# Returns only the number of Google hits for the given search query.
660		# This is the number reported by Google itself, NOT by scraping.
661		def hits(query, tld='com', lang='en', tbs='0', safe='off',
662		domains=None, extra_params={}, tpe='', user_agent=None):
663		"""
664		Search the given query string using Google and return the number of hits.
665
666		:note: This is the number reported by Google itself, NOT by scraping.
667
668		:param str query: Query string. Must NOT be url-encoded.
669		:param str tld: Top level domain.
670		:param str lang: Language.
671		:param str tbs: Time limits (i.e "qdr:h" => last hour,
672		"qdr:d" => last 24 hours, "qdr:m" => last month).
673		:param str safe: Safe search.
674		:param int num: Number of results per page.
675		:param int start: First result to retrieve.
676		:param int or None stop: Last result to retrieve.
677		Use None to keep searching forever.
678		:param list of str or None domains: A list of web domains to constrain
679		the search.
680		:param float pause: Lapse to wait between HTTP requests.
681		A lapse too long will make the search slow, but a lapse too short may
682		cause Google to block your IP. Your mileage may vary!
683		:param bool only_standard: If True, only returns the standard results from
684		each page. If False, it returns every possible link from each page,
685		except for those that point back to Google itself. Defaults to False
686		for backwards compatibility with older versions of this module.
687		:param dict of str to str extra_params: A dictionary of extra HTTP GET
688		parameters, which must be URL encoded. For example if you don't want
689		Google to filter similar results you can set the extra_params to
690		{'filter': '0'} which will append '&filter=0' to every query.
691		:param str tpe: Search type (images, videos, news, shopping, books, apps)
692		Use the following values {videos: 'vid', images: 'isch',
693		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
694		:param str or None user_agent: User agent for the HTTP requests.
695		Use None for the default.
696
697		:rtype: int
698		:return: Number of Google hits for the given search query.
699		"""
700
701		# Prepare domain list if it exists.
702		if domains:
703		domain_query = '+OR+'.join('site:' + domain for domain in domains)
704		domain_query = '+' + domain_query
705		else:
706		domain_query = ''
707
708		# Prepare the search string.
709		query = quote_plus(query + domain_query)
710
711		# Check extra_params for overlapping
712		for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
713		if builtin_param in extra_params.keys():
714		raise ValueError(
715		'GET parameter "%s" is overlapping with \
716		the built-in GET parameter',
717		builtin_param
718		)
719
720		# Grab the cookie from the home page.
721		get_page(url_home % vars(), user_agent)
722
723		# Prepare the URL of the first (and in this case ONLY) request.
724		url = url_search % vars()
725
726		try: # Is it python<3?
727		iter_extra_params = extra_params.iteritems()
728		except AttributeError: # Or python>3?
729		iter_extra_params = extra_params.items()
730		# Append extra GET_parameters to URL
731		for k, v in iter_extra_params:
732		url += url + ('&%s=%s' % (k, v))
733
734		# Request the Google Search results page.
735		html = get_page(url, user_agent)
736
737		# Parse the response.
738		if is_bs4:
739		soup = BeautifulSoup(html, 'html.parser')
740		else:
741		soup = BeautifulSoup(html)
742
743		# Get the number of hits.
744		tag = soup.find_all(attrs={"class": "sd", "id": "resultStats"})[0]
745		hits_text_parts = tag.text.split()
746		if len(hits_text_parts) < 3:
747		return 0
748		return int(hits_text_parts[1].replace(',', '').replace('.', ''))
749
750
def ngd(term1, term2):
    """
    Return the Normalized Google distance between words.

    For more info, refer to:
    https://en.wikipedia.org/wiki/Normalized_Google_distance

    :param str term1: First term to compare.
    :param str term2: Second term to compare.

    :rtype: float
    :return: Normalized Google distance between words. Returns
        ``float('inf')`` when either term, or both terms together,
        produce no hits at all.

    :raises ValueError: If the reference query used to estimate the total
        number of indexed pages reports no hits.
    """

    # Fetch the raw hit counts first. hits() may legitimately return 0 and
    # math.log10(0) raises an unhelpful "math domain error".
    hits1 = hits(term1)
    hits2 = hits(term2)
    hits_mix = hits('"' + term1 + '" "' + term2 + '"')

    # Terms that never occur together are infinitely distant by definition.
    if hits1 <= 0 or hits2 <= 0 or hits_mix <= 0:
        return float('inf')

    # The total number of indexed pages is approximated from the hit count
    # of a very common word, scaled by a fixed factor.
    npages = hits('the')
    if npages <= 0:
        raise ValueError(
            'Could not estimate the number of indexed pages: '
            'the reference query returned no hits')
    fix = 1000

    lhits1 = math.log10(hits1)
    lhits2 = math.log10(hits2)
    lhits_mix = math.log10(hits_mix)

    lN = math.log10(npages * fix)
    numerator = max(lhits1, lhits2) - lhits_mix
    denomin = lN - min(lhits1, lhits2)

    return numerator / denomin
	375	return next(search(args, *kwargs))

+69

-55

scripts/google less more

0	0	#!/usr/bin/env python
1	1
2		# Python bindings to the Google search engine
3		# Copyright (c) 2009-2016, Mario Vilas
	2	# Copyright (c) 2009-2020, Mario Vilas
4	3	# All rights reserved.
5	4	#
6	5	# Redistribution and use in source and binary forms, with or without

31	30
32	31	from googlesearch import search, get_random_user_agent
33	32
	33	# TODO port to argparse
34	34	from optparse import OptionParser, IndentedHelpFormatter
	35
35	36
36	37	class BannerHelpFormatter(IndentedHelpFormatter):
37	38

45	46	msg = IndentedHelpFormatter.format_usage(self, usage)
46	47	return '%s\n%s' % (self.banner, msg)
47	48
48		# Parse the command line arguments.
49		formatter = BannerHelpFormatter(
50		"Python script to use the Google search engine\n"
51		"By Mario Vilas (mvilas at gmail dot com)\n"
52		"https://github.com/MarioVilas/googlesearch\n"
53		)
54		parser = OptionParser(formatter=formatter)
55		parser.set_usage("%prog [options] query")
56		parser.add_option("--tld", metavar="TLD", type="string", default="com",
57		help="top level domain to use [default: com]")
58		parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
59		help="produce results in the given language [default: en]")
60		parser.add_option("--domains", metavar="DOMAINS", type="string", default="",
61		help="comma separated list of domains to constrain the search to")
62		parser.add_option("--tbs", metavar="TBS", type="string", default="0",
63		help="produce results from period [default: 0]")
64		parser.add_option("--safe", metavar="SAFE", type="string", default="off",
65		help="kids safe search [default: off]")
66		parser.add_option("--num", metavar="NUMBER", type="int", default=10,
67		help="number of results per page [default: 10]")
68		parser.add_option("--start", metavar="NUMBER", type="int", default=0,
69		help="first result to retrieve [default: 0]")
70		parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
71		help="last result to retrieve [default: unlimited]")
72		parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
73		help="pause between HTTP requests [default: 2.0]")
74		parser.add_option("--rua", metavar="USERAGENT", action="store_true", default=False,
75		help="Randomize the User-Agent [default: no]")
76		parser.add_option("--all", dest="only_standard",
77		action="store_false", default=True,
78		help="grab all possible links from result pages [default: only standard results]")
79		(options, args) = parser.parse_args()
80		query = ' '.join(args)
81		if not query:
82		parser.print_help()
83		sys.exit(2)
84		params = [(k, v) for (k, v) in options.__dict__.items() if not k.startswith('_')]
85		params = dict(params)
86	49
87		# Split the comma separated list of domains, if present.
88		if 'domains' in params:
89		params['domains'] = [x.strip() for x in params['domains'].split(',')]
	50	def main():
90	51
91		# Randomize the user agent if requested.
92		if "rua" in params:
93		rua = params.pop("rua")
94		if rua:
95		params["user_agent"] = get_random_user_agent()
	52	# Parse the command line arguments.
	53	formatter = BannerHelpFormatter(
	54	"Python script to use the Google search engine\n"
	55	"By Mario Vilas (mvilas at gmail dot com)\n"
	56	"https://github.com/MarioVilas/googlesearch\n"
	57	)
	58	parser = OptionParser(formatter=formatter)
	59	parser.set_usage("%prog [options] query")
	60	parser.add_option(
	61	'--tld', metavar='TLD', type='string', default='com',
	62	help="top level domain to use [default: com]")
	63	parser.add_option(
	64	'--lang', metavar='LANGUAGE', type='string', default='en',
	65	help="produce results in the given language [default: en]")
	66	parser.add_option(
	67	'--tbs', metavar='TBS', type='string', default='0',
	68	help="produce results from period [default: 0]")
	69	parser.add_option(
	70	'--safe', metavar='SAFE', type='string', default='off',
	71	help="kids safe search [default: off]")
	72	parser.add_option(
	73	'--country', metavar='COUNTRY', type='string', default='',
	74	help="region to restrict search on [default: not restricted]")
	75	parser.add_option(
	76	'--num', metavar='NUMBER', type='int', default=10,
	77	help="number of results per page [default: 10]")
	78	parser.add_option(
	79	'--start', metavar='NUMBER', type='int', default=0,
	80	help="first result to retrieve [default: 0]")
	81	parser.add_option(
	82	'--stop', metavar='NUMBER', type='int', default=0,
	83	help="last result to retrieve [default: unlimited]")
	84	parser.add_option(
	85	'--pause', metavar='SECONDS', type='float', default=2.0,
	86	help="pause between HTTP requests [default: 2.0]")
	87	parser.add_option(
	88	'--rua', action='store_true', default=False,
	89	help="Randomize the User-Agent [default: no]")
	90	parser.add_option(
	91	'--insecure', dest="verify_ssl", action='store_false', default=True,
	92	help="Randomize the User-Agent [default: no]")
	93	(options, args) = parser.parse_args()
	94	query = ' '.join(args)
	95	if not query:
	96	parser.print_help()
	97	sys.exit(2)
	98	params = [
	99	(k, v) for (k, v) in options.__dict__.items()
	100	if not k.startswith('_')]
	101	params = dict(params)
96	102
97		# Run the query.
98		for url in search(query, **params):
99		print(url)
100		try:
101		sys.stdout.flush()
102		except:
103		pass
	103	# Randomize the user agent if requested.
	104	if 'rua' in params and params.pop('rua'):
	105	params['user_agent'] = get_random_user_agent()
	106
	107	# Run the query.
	108	for url in search(query, **params):
	109	print(url)
	110	try:
	111	sys.stdout.flush()
	112	except Exception:
	113	pass
	114
	115
	116	if __name__ == '__main__':
	117	main()

-8

setup.py less more

0	0	#!/usr/bin/env python
1	1
2		# Copyright (c) 2009-2019, Mario Vilas
	2	# Copyright (c) 2009-2020, Mario Vilas
3	3	# All rights reserved.
4	4	#
5	5	# Redistribution and use in source and binary forms, with or without

43	43	scripts=[join('scripts', 'google')],
44	44	package_data={'googlesearch': ['user_agents.txt.gz']},
45	45	include_package_data=True,
46		version="2.0.2",
	46	version="3.0.0",
47	47	description="Python bindings to the Google search engine.",
48	48	author="Mario Vilas",
49	49	author_email="[email protected]",

66	66	except ImportError:
67	67	from distutils.core import setup
68	68
69		# Get the long description from the readme file.
70		try:
71		metadata['long_description'] = open(join(here, 'README.md'), 'rU').read()
72		except Exception:
73		pass
74
75	69	# Run the setup script.
76	70	setup(**metadata)