Commit 19abad43bd613662beed3fc0d01a658c596b8d10 - python-googlesearch

Update upstream source from tag 'upstream/2.0.3'. Update to upstream version '2.0.3' with Debian dir 23f01bae18c25b733c06069de12814c5368a6dfc. Sophie Brun, 3 years ago

5 changed file(s) with 214 addition(s) and 536 deletion(s). Raw diff Collapse all Expand all

-21

PKG-INFO less more

0	0	Metadata-Version: 1.1
1	1	Name: google
2		Version: 2.0.2
	2	Version: 2.0.3
3	3	Summary: Python bindings to the Google search engine.
4	4	Home-page: http://breakingcode.wordpress.com/
5	5	Author: Mario Vilas
6	6	Author-email: [email protected]
7	7	License: UNKNOWN
8		Description: googlesearch
9		============
10
11		Google search from Python.
12
13		https://python-googlesearch.readthedocs.io/en/latest/
14
15		Usage example
16		-------------
17
18		# Get the first 20 hits for: "Breaking Code" WordPress blog
19		from googlesearch import search
20		for url in search('"Breaking Code" WordPress blog', stop=20):
21		print(url)
22
23		Installing
24		----------
25
26		pip install google
27
	8	Description: UNKNOWN
28	9	Platform: UNKNOWN
29	10	Classifier: Development Status :: 5 - Production/Stable
30	11	Classifier: Intended Audience :: Developers

-21

google.egg-info/PKG-INFO less more

0	0	Metadata-Version: 1.1
1	1	Name: google
2		Version: 2.0.2
	2	Version: 2.0.3
3	3	Summary: Python bindings to the Google search engine.
4	4	Home-page: http://breakingcode.wordpress.com/
5	5	Author: Mario Vilas
6	6	Author-email: [email protected]
7	7	License: UNKNOWN
8		Description: googlesearch
9		============
10
11		Google search from Python.
12
13		https://python-googlesearch.readthedocs.io/en/latest/
14
15		Usage example
16		-------------
17
18		# Get the first 20 hits for: "Breaking Code" WordPress blog
19		from googlesearch import search
20		for url in search('"Breaking Code" WordPress blog', stop=20):
21		print(url)
22
23		Installing
24		----------
25
26		pip install google
27
	8	Description: UNKNOWN
28	9	Platform: UNKNOWN
29	10	Classifier: Development Status :: 5 - Production/Stable
30	11	Classifier: Intended Audience :: Developers

+114

-438

googlesearch/__init__.py less more

0	0	#!/usr/bin/env python
1	1
2	2	# Python bindings to the Google search engine
3		# Copyright (c) 2009-2018, Mario Vilas
	3	# Copyright (c) 2009-2019, Mario Vilas
4	4	# All rights reserved.
5	5	#
6	6	# Redistribution and use in source and binary forms, with or without

31	31	import random
32	32	import sys
33	33	import time
34		import math
35	34
36	35	if sys.version_info[0] > 2:
37	36	from http.cookiejar import LWPCookieJar

63	62	# Shortcut for "get lucky" search.
64	63	'lucky',
65	64
66		# Computations based on the number of Google hits.
67		'hits', 'ngd',
68
69	65	# Miscellaneous utility functions.
70		'get_random_user_agent',
	66	'get_random_user_agent', 'get_tbs',
71	67	]
72	68
73	69	# URL templates to make Google searches.
74	70	url_home = "https://www.google.%(tld)s/"
75	71	url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
76		"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
	72	"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
	73	"cr=%(country)s"
77	74	url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
78		"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
	75	"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
	76	"cr=%(country)s"
79	77	url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
80	78	"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
81		"tbm=%(tpe)s"
	79	"tbm=%(tpe)s&cr=%(country)s"
82	80	url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
83	81	"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
84		"safe=%(safe)s&tbm=%(tpe)s"
	82	"safe=%(safe)s&tbm=%(tpe)s&cr=%(country)s"
	83	url_parameters = (
	84	'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'tbm', 'cr')
85	85
86	86	# Cookie jar. Stored at the user's home folder.
	87	# If the cookie jar is inaccessible, the errors are ignored.
87	88	home_folder = os.getenv('HOME')
88	89	if not home_folder:
89	90	home_folder = os.getenv('USERHOME')

99	100	USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
100	101
101	102	# Load the list of valid user agents from the install folder.
	103	# The search order is:
	104	# * user_agents.txt.gz
	105	# * user_agents.txt
	106	# * default user agent
102	107	try:
103	108	install_folder = os.path.abspath(os.path.split(__file__)[0])
104	109	try:

129	134	return random.choice(user_agents_list)
130	135
131	136
	137	# Helper function to format the tbs parameter.
	138	def get_tbs(from_date, to_date):
	139	"""
	140	Helper function to format the tbs parameter.
	141
	142	:param datetime.date from_date: Python date object.
	143	:param datetime.date to_date: Python date object.
	144
	145	:rtype: str
	146	:return: Dates encoded in tbs format.
	147	"""
	148	from_date = from_date.strftime('%m/%d/%Y')
	149	to_date = to_date.strftime('%m/%d/%Y')
	150	return 'cdr:1,cd_min:%(from_date)s,cd_max:%(to_date)s' % vars()
	151
	152
132	153	# Request the given URL and return the response page, using the cookie jar.
	154	# If the cookie jar is inaccessible, the errors are ignored.
133	155	def get_page(url, user_agent=None):
134	156	"""
135	157	Request the given URL and return the response page, using the cookie jar.

166	188	def filter_result(link):
167	189	try:
168	190
169		# Valid results are absolute URLs not pointing to a Google domain
170		# like images.google.com or googleusercontent.com
	191	# Decode hidden URLs.
	192	if link.startswith('/url?'):
	193	o = urlparse(link, 'http')
	194	link = parse_qs(o.query)['q'][0]
	195
	196	# Valid results are absolute URLs not pointing to a Google domain,
	197	# like images.google.com or googleusercontent.com for example.
	198	# TODO this could be improved!
171	199	o = urlparse(link, 'http')
172	200	if o.netloc and 'google' not in o.netloc:
173	201	return link
174	202
175		# Decode hidden URLs.
176		if link.startswith('/url?'):
177		link = parse_qs(o.query)['q'][0]
178
179		# Valid results are absolute URLs not pointing to a Google domain
180		# like images.google.com or googleusercontent.com
181		o = urlparse(link, 'http')
182		if o.netloc and 'google' not in o.netloc:
183		return link
184
185		# Otherwise, or on error, return None.
	203	# On error, return None.
186	204	except Exception:
187	205	pass
188		return None
189	206
190	207
191	208	# Returns a generator that yields URLs.
192	209	def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
193		stop=None, domains=None, pause=2.0, only_standard=False,
194		extra_params={}, tpe='', user_agent=None):
	210	stop=None, domains=None, pause=2.0, tpe='', country='',
	211	extra_params=None, user_agent=None):
195	212	"""
196	213	Search the given query string using Google.
197	214

203	220	:param str safe: Safe search.
204	221	:param int num: Number of results per page.
205	222	:param int start: First result to retrieve.
206		:param int or None stop: Last result to retrieve.
	223	:param int stop: Last result to retrieve.
207	224	Use None to keep searching forever.
208		:param list of str or None domains: A list of web domains to constrain
	225	:param list domains: A list of web domains to constrain
209	226	the search.
210	227	:param float pause: Lapse to wait between HTTP requests.
211	228	A lapse too long will make the search slow, but a lapse too short may
212	229	cause Google to block your IP. Your mileage may vary!
213		:param bool only_standard: If True, only returns the standard results from
214		each page. If False, it returns every possible link from each page,
215		except for those that point back to Google itself. Defaults to False
216		for backwards compatibility with older versions of this module.
217		:param dict of str to str extra_params: A dictionary of extra HTTP GET
	230	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	231	Use the following values {videos: 'vid', images: 'isch',
	232	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	233	:param str country: Country or region to focus the search on. Similar to
	234	changing the TLD, but does not yield exactly the same results.
	235	Only Google knows why...
	236	:param dict extra_params: A dictionary of extra HTTP GET
218	237	parameters, which must be URL encoded. For example if you don't want
219	238	Google to filter similar results you can set the extra_params to
220	239	{'filter': '0'} which will append '&filter=0' to every query.
221		:param str tpe: Search type (images, videos, news, shopping, books, apps)
222		Use the following values {videos: 'vid', images: 'isch',
223		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
224		:param str or None user_agent: User agent for the HTTP requests.
	240	:param str user_agent: User agent for the HTTP requests.
225	241	Use None for the default.
226	242
227	243	:rtype: generator of str

232	248	# This is used to avoid repeated results.
233	249	hashes = set()
234	250
235		# Count the number of links yielded
	251	# Count the number of links yielded.
236	252	count = 0
237	253
238	254	# Prepare domain list if it exists.

243	259	# Prepare the search string.
244	260	query = quote_plus(query)
245	261
246		# Check extra_params for overlapping
247		for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
	262	# If no extra_params is given, create an empty dictionary.
	263	# We should avoid using an empty dictionary as a default value
	264	# in a function parameter in Python.
	265	if not extra_params:
	266	extra_params = {}
	267
	268	# Check extra_params for overlapping.
	269	for builtin_param in url_parameters:
248	270	if builtin_param in extra_params.keys():
249	271	raise ValueError(
250	272	'GET parameter "%s" is overlapping with \

269	291
270	292	# Loop until we reach the maximum result, if any (otherwise, loop forever).
271	293	while not stop or count < stop:
272		# Remeber last count to detect the end of results
	294
	295	# Remeber last count to detect the end of results.
273	296	last_count = count
274	297
275		try: # Is it python<3?
276		iter_extra_params = extra_params.iteritems()
277		except AttributeError: # Or python>3?
278		iter_extra_params = extra_params.items()
279		# Append extra GET_parameters to URL
280		for k, v in iter_extra_params:
281		url += url + ('&%s=%s' % (k, v))
	298	# Append extra GET parameters to the URL.
	299	# This is done on every iteration because we're
	300	# rebuilding the entire URL at the end of this loop.
	301	for k, v in extra_params.items():
	302	k = quote_plus(k)
	303	v = quote_plus(v)
	304	url = url + ('&%s=%s' % (k, v))
282	305
283	306	# Sleep between requests.
	307	# Keeps Google from banning you for making too many requests.
284	308	time.sleep(pause)
285	309
286	310	# Request the Google Search results page.
287	311	html = get_page(url, user_agent)
288	312
289		# Parse the response and process every anchored URL.
	313	# Parse the response and get every anchored URL.
290	314	if is_bs4:
291	315	soup = BeautifulSoup(html, 'html.parser')
292	316	else:

294	318	try:
295	319	anchors = soup.find(id='search').findAll('a')
296	320	# Sometimes (depending on the User-agent) there is
297		# no id "search" in html response
	321	# no id "search" in html response...
298	322	except AttributeError:
299		# Remove links of the top bar
	323	# Remove links of the top bar.
300	324	gbar = soup.find(id='gbar')
301	325	if gbar:
302	326	gbar.clear()
303	327	anchors = soup.findAll('a')
	328
	329	# Process every anchored URL.
304	330	for a in anchors:
305
306		# Leave only the "standard" results if requested.
307		# Otherwise grab all possible links.
308		if only_standard and (
309		not a.parent or a.parent.name.lower() != "h3"):
310		continue
311	331
312	332	# Get the URL from the anchor tag.
313	333	try:

329	349	# Yield the result.
330	350	yield link
331	351
	352	# Increase the results counter.
	353	# If we reached the limit, stop.
332	354	count += 1
333	355	if stop and count >= stop:
334	356	return
335	357
336	358	# End if there are no more results.
	359	# XXX TODO review this logic, not sure if this is still true!
337	360	if last_count == count:
338	361	break
339	362

347	370
348	371	# Shortcut to search images.
349	372	# Beware, this does not return the image link.
350		def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
351		start=0, stop=None, pause=2.0, domains=None,
352		only_standard=False, extra_params={}):
	373	def search_images(*args, **kwargs):
353	374	"""
354	375	Shortcut to search images.
355	376
	377	Same arguments and return value as the main search function.
	378
356	379	:note: Beware, this does not return the image link.
357
358		:param str query: Query string. Must NOT be url-encoded.
359		:param str tld: Top level domain.
360		:param str lang: Language.
361		:param str tbs: Time limits (i.e "qdr:h" => last hour,
362		"qdr:d" => last 24 hours, "qdr:m" => last month).
363		:param str safe: Safe search.
364		:param int num: Number of results per page.
365		:param int start: First result to retrieve.
366		:param int or None stop: Last result to retrieve.
367		Use None to keep searching forever.
368		:param list of str or None domains: A list of web domains to constrain
369		the search.
370		:param float pause: Lapse to wait between HTTP requests.
371		A lapse too long will make the search slow, but a lapse too short may
372		cause Google to block your IP. Your mileage may vary!
373		:param bool only_standard: If True, only returns the standard results from
374		each page. If False, it returns every possible link from each page,
375		except for those that point back to Google itself. Defaults to False
376		for backwards compatibility with older versions of this module.
377		:param dict of str to str extra_params: A dictionary of extra HTTP GET
378		parameters, which must be URL encoded. For example if you don't want
379		Google to filter similar results you can set the extra_params to
380		{'filter': '0'} which will append '&filter=0' to every query.
381		:param str tpe: Search type (images, videos, news, shopping, books, apps)
382		Use the following values {videos: 'vid', images: 'isch',
383		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
384		:param str or None user_agent: User agent for the HTTP requests.
385		Use None for the default.
386
387		:rtype: generator of str
388		:return: Generator (iterator) that yields found URLs.
389		If the stop parameter is None the iterator will loop forever.
390		"""
391		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
392		pause, only_standard, extra_params, tpe='isch')
	380	"""
	381	kwargs['tpe'] = 'isch'
	382	return search(*args, **kwargs)
393	383
394	384
395	385	# Shortcut to search news.
396		def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
397		start=0, stop=None, domains=None, pause=2.0,
398		only_standard=False, extra_params={}):
	386	def search_news(*args, **kwargs):
399	387	"""
400	388	Shortcut to search news.
401	389
402		:param str query: Query string. Must NOT be url-encoded.
403		:param str tld: Top level domain.
404		:param str lang: Language.
405		:param str tbs: Time limits (i.e "qdr:h" => last hour,
406		"qdr:d" => last 24 hours, "qdr:m" => last month).
407		:param str safe: Safe search.
408		:param int num: Number of results per page.
409		:param int start: First result to retrieve.
410		:param int or None stop: Last result to retrieve.
411		Use None to keep searching forever.
412		:param list of str or None domains: A list of web domains to constrain
413		the search.
414		:param float pause: Lapse to wait between HTTP requests.
415		A lapse too long will make the search slow, but a lapse too short may
416		cause Google to block your IP. Your mileage may vary!
417		:param bool only_standard: If True, only returns the standard results from
418		each page. If False, it returns every possible link from each page,
419		except for those that point back to Google itself. Defaults to False
420		for backwards compatibility with older versions of this module.
421		:param dict of str to str extra_params: A dictionary of extra HTTP GET
422		parameters, which must be URL encoded. For example if you don't want
423		Google to filter similar results you can set the extra_params to
424		{'filter': '0'} which will append '&filter=0' to every query.
425		:param str tpe: Search type (images, videos, news, shopping, books, apps)
426		Use the following values {videos: 'vid', images: 'isch',
427		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
428		:param str or None user_agent: User agent for the HTTP requests.
429		Use None for the default.
430
431		:rtype: generator of str
432		:return: Generator (iterator) that yields found URLs.
433		If the stop parameter is None the iterator will loop forever.
434		"""
435		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
436		pause, only_standard, extra_params, tpe='nws')
	390	Same arguments and return value as the main search function.
	391	"""
	392	kwargs['tpe'] = 'nws'
	393	return search(*args, **kwargs)
437	394
438	395
439	396	# Shortcut to search videos.
440		def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
441		start=0, stop=None, domains=None, pause=2.0,
442		only_standard=False, extra_params={}):
	397	def search_videos(*args, **kwargs):
443	398	"""
444	399	Shortcut to search videos.
445	400
446		:param str query: Query string. Must NOT be url-encoded.
447		:param str tld: Top level domain.
448		:param str lang: Language.
449		:param str tbs: Time limits (i.e "qdr:h" => last hour,
450		"qdr:d" => last 24 hours, "qdr:m" => last month).
451		:param str safe: Safe search.
452		:param int num: Number of results per page.
453		:param int start: First result to retrieve.
454		:param int or None stop: Last result to retrieve.
455		Use None to keep searching forever.
456		:param list of str or None domains: A list of web domains to constrain
457		the search.
458		:param float pause: Lapse to wait between HTTP requests.
459		A lapse too long will make the search slow, but a lapse too short may
460		cause Google to block your IP. Your mileage may vary!
461		:param bool only_standard: If True, only returns the standard results from
462		each page. If False, it returns every possible link from each page,
463		except for those that point back to Google itself. Defaults to False
464		for backwards compatibility with older versions of this module.
465		:param dict of str to str extra_params: A dictionary of extra HTTP GET
466		parameters, which must be URL encoded. For example if you don't want
467		Google to filter similar results you can set the extra_params to
468		{'filter': '0'} which will append '&filter=0' to every query.
469		:param str tpe: Search type (images, videos, news, shopping, books, apps)
470		Use the following values {videos: 'vid', images: 'isch',
471		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
472		:param str or None user_agent: User agent for the HTTP requests.
473		Use None for the default.
474
475		:rtype: generator of str
476		:return: Generator (iterator) that yields found URLs.
477		If the stop parameter is None the iterator will loop forever.
478		"""
479		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
480		pause, only_standard, extra_params, tpe='vid')
	401	Same arguments and return value as the main search function.
	402	"""
	403	kwargs['tpe'] = 'vid'
	404	return search(*args, **kwargs)
481	405
482	406
483	407	# Shortcut to search shop.
484		def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
485		start=0, stop=None, domains=None, pause=2.0,
486		only_standard=False, extra_params={}):
	408	def search_shop(*args, **kwargs):
487	409	"""
488	410	Shortcut to search shop.
489	411
490		:param str query: Query string. Must NOT be url-encoded.
491		:param str tld: Top level domain.
492		:param str lang: Language.
493		:param str tbs: Time limits (i.e "qdr:h" => last hour,
494		"qdr:d" => last 24 hours, "qdr:m" => last month).
495		:param str safe: Safe search.
496		:param int num: Number of results per page.
497		:param int start: First result to retrieve.
498		:param int or None stop: Last result to retrieve.
499		Use None to keep searching forever.
500		:param list of str or None domains: A list of web domains to constrain
501		the search.
502		:param float pause: Lapse to wait between HTTP requests.
503		A lapse too long will make the search slow, but a lapse too short may
504		cause Google to block your IP. Your mileage may vary!
505		:param bool only_standard: If True, only returns the standard results from
506		each page. If False, it returns every possible link from each page,
507		except for those that point back to Google itself. Defaults to False
508		for backwards compatibility with older versions of this module.
509		:param dict of str to str extra_params: A dictionary of extra HTTP GET
510		parameters, which must be URL encoded. For example if you don't want
511		Google to filter similar results you can set the extra_params to
512		{'filter': '0'} which will append '&filter=0' to every query.
513		:param str tpe: Search type (images, videos, news, shopping, books, apps)
514		Use the following values {videos: 'vid', images: 'isch',
515		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
516		:param str or None user_agent: User agent for the HTTP requests.
517		Use None for the default.
518
519		:rtype: generator of str
520		:return: Generator (iterator) that yields found URLs.
521		If the stop parameter is None the iterator will loop forever.
522		"""
523		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
524		pause, only_standard, extra_params, tpe='shop')
	412	Same arguments and return value as the main search function.
	413	"""
	414	kwargs['tpe'] = 'shop'
	415	return search(*args, **kwargs)
525	416
526	417
527	418	# Shortcut to search books.
528		def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
529		start=0, stop=None, domains=None, pause=2.0,
530		only_standard=False, extra_params={}):
	419	def search_books(*args, **kwargs):
531	420	"""
532	421	Shortcut to search books.
533	422
534		:param str query: Query string. Must NOT be url-encoded.
535		:param str tld: Top level domain.
536		:param str lang: Language.
537		:param str tbs: Time limits (i.e "qdr:h" => last hour,
538		"qdr:d" => last 24 hours, "qdr:m" => last month).
539		:param str safe: Safe search.
540		:param int num: Number of results per page.
541		:param int start: First result to retrieve.
542		:param int or None stop: Last result to retrieve.
543		Use None to keep searching forever.
544		:param list of str or None domains: A list of web domains to constrain
545		the search.
546		:param float pause: Lapse to wait between HTTP requests.
547		A lapse too long will make the search slow, but a lapse too short may
548		cause Google to block your IP. Your mileage may vary!
549		:param bool only_standard: If True, only returns the standard results from
550		each page. If False, it returns every possible link from each page,
551		except for those that point back to Google itself. Defaults to False
552		for backwards compatibility with older versions of this module.
553		:param dict of str to str extra_params: A dictionary of extra HTTP GET
554		parameters, which must be URL encoded. For example if you don't want
555		Google to filter similar results you can set the extra_params to
556		{'filter': '0'} which will append '&filter=0' to every query.
557		:param str tpe: Search type (images, videos, news, shopping, books, apps)
558		Use the following values {videos: 'vid', images: 'isch',
559		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
560		:param str or None user_agent: User agent for the HTTP requests.
561		Use None for the default.
562
563		:rtype: generator of str
564		:return: Generator (iterator) that yields found URLs.
565		If the stop parameter is None the iterator will loop forever.
566		"""
567		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
568		pause, only_standard, extra_params, tpe='bks')
	423	Same arguments and return value as the main search function.
	424	"""
	425	kwargs['tpe'] = 'bks'
	426	return search(*args, **kwargs)
569	427
570	428
571	429	# Shortcut to search apps.
572		def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
573		start=0, stop=None, domains=None, pause=2.0,
574		only_standard=False, extra_params={}):
	430	def search_apps(*args, **kwargs):
575	431	"""
576	432	Shortcut to search apps.
577	433
578		:param str query: Query string. Must NOT be url-encoded.
579		:param str tld: Top level domain.
580		:param str lang: Language.
581		:param str tbs: Time limits (i.e "qdr:h" => last hour,
582		"qdr:d" => last 24 hours, "qdr:m" => last month).
583		:param str safe: Safe search.
584		:param int num: Number of results per page.
585		:param int start: First result to retrieve.
586		:param int or None stop: Last result to retrieve.
587		Use None to keep searching forever.
588		:param list of str or None domains: A list of web domains to constrain
589		the search.
590		:param float pause: Lapse to wait between HTTP requests.
591		A lapse too long will make the search slow, but a lapse too short may
592		cause Google to block your IP. Your mileage may vary!
593		:param bool only_standard: If True, only returns the standard results from
594		each page. If False, it returns every possible link from each page,
595		except for those that point back to Google itself. Defaults to False
596		for backwards compatibility with older versions of this module.
597		:param dict of str to str extra_params: A dictionary of extra HTTP GET
598		parameters, which must be URL encoded. For example if you don't want
599		Google to filter similar results you can set the extra_params to
600		{'filter': '0'} which will append '&filter=0' to every query.
601		:param str tpe: Search type (images, videos, news, shopping, books, apps)
602		Use the following values {videos: 'vid', images: 'isch',
603		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
604		:param str or None user_agent: User agent for the HTTP requests.
605		Use None for the default.
606
607		:rtype: generator of str
608		:return: Generator (iterator) that yields found URLs.
609		If the stop parameter is None the iterator will loop forever.
610		"""
611		return search(query, tld, lang, tbs, safe, num, start, stop, domains,
612		pause, only_standard, extra_params, tpe='app')
	434	Same arguments and return value as the main search function.
	435	"""
	436	kwargs['tpe'] = 'app'
	437	return search(*args, **kwargs)
613	438
614	439
615	440	# Shortcut to single-item search.
616	441	# Evaluates the iterator to return the single URL as a string.
617		def lucky(query, tld='com', lang='en', tbs='0', safe='off',
618		only_standard=False, extra_params={}, tpe=''):
	442	def lucky(*args, **kwargs):
619	443	"""
620	444	Shortcut to single-item search.
621	445
622		:param str query: Query string. Must NOT be url-encoded.
623		:param str tld: Top level domain.
624		:param str lang: Language.
625		:param str tbs: Time limits (i.e "qdr:h" => last hour,
626		"qdr:d" => last 24 hours, "qdr:m" => last month).
627		:param str safe: Safe search.
628		:param int num: Number of results per page.
629		:param int start: First result to retrieve.
630		:param int or None stop: Last result to retrieve.
631		Use None to keep searching forever.
632		:param list of str or None domains: A list of web domains to constrain
633		the search.
634		:param float pause: Lapse to wait between HTTP requests.
635		A lapse too long will make the search slow, but a lapse too short may
636		cause Google to block your IP. Your mileage may vary!
637		:param bool only_standard: If True, only returns the standard results from
638		each page. If False, it returns every possible link from each page,
639		except for those that point back to Google itself. Defaults to False
640		for backwards compatibility with older versions of this module.
641		:param dict of str to str extra_params: A dictionary of extra HTTP GET
642		parameters, which must be URL encoded. For example if you don't want
643		Google to filter similar results you can set the extra_params to
644		{'filter': '0'} which will append '&filter=0' to every query.
645		:param str tpe: Search type (images, videos, news, shopping, books, apps)
646		Use the following values {videos: 'vid', images: 'isch',
647		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
648		:param str or None user_agent: User agent for the HTTP requests.
649		Use None for the default.
	446	Same arguments as the main search function, but the return value changes.
650	447
651	448	:rtype: str
652	449	:return: URL found by Google.
653	450	"""
654		gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
655		extra_params, tpe)
656		return next(gen)
657
658
659		# Returns only the number of Google hits for the given search query.
660		# This is the number reported by Google itself, NOT by scraping.
661		def hits(query, tld='com', lang='en', tbs='0', safe='off',
662		domains=None, extra_params={}, tpe='', user_agent=None):
663		"""
664		Search the given query string using Google and return the number of hits.
665
666		:note: This is the number reported by Google itself, NOT by scraping.
667
668		:param str query: Query string. Must NOT be url-encoded.
669		:param str tld: Top level domain.
670		:param str lang: Language.
671		:param str tbs: Time limits (i.e "qdr:h" => last hour,
672		"qdr:d" => last 24 hours, "qdr:m" => last month).
673		:param str safe: Safe search.
674		:param int num: Number of results per page.
675		:param int start: First result to retrieve.
676		:param int or None stop: Last result to retrieve.
677		Use None to keep searching forever.
678		:param list of str or None domains: A list of web domains to constrain
679		the search.
680		:param float pause: Lapse to wait between HTTP requests.
681		A lapse too long will make the search slow, but a lapse too short may
682		cause Google to block your IP. Your mileage may vary!
683		:param bool only_standard: If True, only returns the standard results from
684		each page. If False, it returns every possible link from each page,
685		except for those that point back to Google itself. Defaults to False
686		for backwards compatibility with older versions of this module.
687		:param dict of str to str extra_params: A dictionary of extra HTTP GET
688		parameters, which must be URL encoded. For example if you don't want
689		Google to filter similar results you can set the extra_params to
690		{'filter': '0'} which will append '&filter=0' to every query.
691		:param str tpe: Search type (images, videos, news, shopping, books, apps)
692		Use the following values {videos: 'vid', images: 'isch',
693		news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
694		:param str or None user_agent: User agent for the HTTP requests.
695		Use None for the default.
696
697		:rtype: int
698		:return: Number of Google hits for the given search query.
699		"""
700
701		# Prepare domain list if it exists.
702		if domains:
703		domain_query = '+OR+'.join('site:' + domain for domain in domains)
704		domain_query = '+' + domain_query
705		else:
706		domain_query = ''
707
708		# Prepare the search string.
709		query = quote_plus(query + domain_query)
710
711		# Check extra_params for overlapping
712		for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
713		if builtin_param in extra_params.keys():
714		raise ValueError(
715		'GET parameter "%s" is overlapping with \
716		the built-in GET parameter',
717		builtin_param
718		)
719
720		# Grab the cookie from the home page.
721		get_page(url_home % vars(), user_agent)
722
723		# Prepare the URL of the first (and in this cases ONLY) request.
724		url = url_search % vars()
725
726		try: # Is it python<3?
727		iter_extra_params = extra_params.iteritems()
728		except AttributeError: # Or python>3?
729		iter_extra_params = extra_params.items()
730		# Append extra GET_parameters to URL
731		for k, v in iter_extra_params:
732		url += url + ('&%s=%s' % (k, v))
733
734		# Request the Google Search results page.
735		html = get_page(url, user_agent)
736
737		# Parse the response.
738		if is_bs4:
739		soup = BeautifulSoup(html, 'html.parser')
740		else:
741		soup = BeautifulSoup(html)
742
743		# Get the number of hits.
744		tag = soup.find_all(attrs={"class": "sd", "id": "resultStats"})[0]
745		hits_text_parts = tag.text.split()
746		if len(hits_text_parts) < 3:
747		return 0
748		return int(hits_text_parts[1].replace(',', '').replace('.', ''))
749
750
def ngd(term1, term2):
    """
    Return the Normalized Google distance between words.

    For more info, refer to:
    https://en.wikipedia.org/wiki/Normalized_Google_distance

    :param str term1: First term to compare.
    :param str term2: Second term to compare.

    :rtype: float
    :return: Normalized Google distance between words. If either term, or
        the combination of both, has no hits at all, the distance is
        infinite and ``float('inf')`` is returned.

    :raises ValueError: If the reference query used to estimate the index
        size reports no hits (the logarithm is undefined).
    """

    # Three queries: each term alone, then both terms as quoted phrases.
    hits1 = hits(term1)
    hits2 = hits(term2)
    hits_mix = hits('"' + term1 + '" "' + term2 + '"')

    # log10(0) is undefined; terms that never occur (together) are
    # infinitely distant by definition, so report that instead of crashing.
    if min(hits1, hits2, hits_mix) <= 0:
        return float('inf')

    lhits1 = math.log10(hits1)
    lhits2 = math.log10(hits2)
    lhits_mix = math.log10(hits_mix)

    # Estimate the total number of indexed pages from the hit count of a
    # very common word, scaled by a fixed factor.
    # NOTE(review): the factor 1000 is kept from the original code; its
    # derivation is not documented here.
    npages = hits('the')
    fix = 1000

    lN = math.log10(npages * fix)
    numerator = max(lhits1, lhits2) - lhits_mix
    denomin = lN - min(lhits1, lhits2)

    return numerator / denomin
	451	return next(search(*args, **kwargs))

+87

-54

scripts/google less more

0	0	#!/usr/bin/env python
1	1
2	2	# Python bindings to the Google search engine
3		# Copyright (c) 2009-2016, Mario Vilas
	3	# Copyright (c) 2009-2019, Mario Vilas
4	4	# All rights reserved.
5	5	#
6	6	# Redistribution and use in source and binary forms, with or without

31	31
32	32	from googlesearch import search, get_random_user_agent
33	33
	34	# TODO port to argparse
34	35	from optparse import OptionParser, IndentedHelpFormatter
	36
35	37
36	38	class BannerHelpFormatter(IndentedHelpFormatter):
37	39

45	47	msg = IndentedHelpFormatter.format_usage(self, usage)
46	48	return '%s\n%s' % (self.banner, msg)
47	49
48		# Parse the command line arguments.
49		formatter = BannerHelpFormatter(
50		"Python script to use the Google search engine\n"
51		"By Mario Vilas (mvilas at gmail dot com)\n"
52		"https://github.com/MarioVilas/googlesearch\n"
53		)
54		parser = OptionParser(formatter=formatter)
55		parser.set_usage("%prog [options] query")
56		parser.add_option("--tld", metavar="TLD", type="string", default="com",
57		help="top level domain to use [default: com]")
58		parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
59		help="produce results in the given language [default: en]")
60		parser.add_option("--domains", metavar="DOMAINS", type="string", default="",
61		help="comma separated list of domains to constrain the search to")
62		parser.add_option("--tbs", metavar="TBS", type="string", default="0",
63		help="produce results from period [default: 0]")
64		parser.add_option("--safe", metavar="SAFE", type="string", default="off",
65		help="kids safe search [default: off]")
66		parser.add_option("--num", metavar="NUMBER", type="int", default=10,
67		help="number of results per page [default: 10]")
68		parser.add_option("--start", metavar="NUMBER", type="int", default=0,
69		help="first result to retrieve [default: 0]")
70		parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
71		help="last result to retrieve [default: unlimited]")
72		parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
73		help="pause between HTTP requests [default: 2.0]")
74		parser.add_option("--rua", metavar="USERAGENT", action="store_true", default=False,
75		help="Randomize the User-Agent [default: no]")
76		parser.add_option("--all", dest="only_standard",
77		action="store_false", default=True,
78		help="grab all possible links from result pages [default: only standard results]")
79		(options, args) = parser.parse_args()
80		query = ' '.join(args)
81		if not query:
82		parser.print_help()
83		sys.exit(2)
84		params = [(k, v) for (k, v) in options.__dict__.items() if not k.startswith('_')]
85		params = dict(params)
86	50
87		# Split the comma separated list of domains, if present.
88		if 'domains' in params:
89		params['domains'] = [x.strip() for x in params['domains'].split(',')]
	51	def main():
90	52
91		# Randomize the user agent if requested.
92		if "rua" in params:
93		rua = params.pop("rua")
94		if rua:
95		params["user_agent"] = get_random_user_agent()
	53	# Parse the command line arguments.
	54	formatter = BannerHelpFormatter(
	55	"Python script to use the Google search engine\n"
	56	"By Mario Vilas (mvilas at gmail dot com)\n"
	57	"https://github.com/MarioVilas/googlesearch\n"
	58	)
	59	parser = OptionParser(formatter=formatter)
	60	parser.set_usage("%prog [options] query")
	61	parser.add_option(
	62	'--tld', metavar='TLD', type='string', default='com',
	63	help="top level domain to use [default: com]")
	64	parser.add_option(
	65	'--lang', metavar='LANGUAGE', type='string', default='en',
	66	help="produce results in the given language [default: en]")
	67	parser.add_option(
	68	'--domains', metavar='DOMAINS', type='string', default='',
	69	help="comma separated list of domains to constrain the search to")
	70	parser.add_option(
	71	'--tbs', metavar='TBS', type='string', default='0',
	72	help="produce results from period [default: 0]")
	73	parser.add_option(
	74	'--safe', metavar='SAFE', type='string', default='off',
	75	help="kids safe search [default: off]")
	76	parser.add_option(
	77	'--type', metavar='TYPE', type='string', default='search', dest='tpe',
	78	help="search type (search, images, videos, news, shopping, books,"
	79	" apps) [default: search]")
	80	parser.add_option(
	81	'--country', metavar='COUNTRY', type='string', default='',
	82	help="region to restrict search on [default: not restricted]")
	83	parser.add_option(
	84	'--num', metavar='NUMBER', type='int', default=10,
	85	help="number of results per page [default: 10]")
	86	parser.add_option(
	87	'--start', metavar='NUMBER', type='int', default=0,
	88	help="first result to retrieve [default: 0]")
	89	parser.add_option(
	90	'--stop', metavar='NUMBER', type='int', default=0,
	91	help="last result to retrieve [default: unlimited]")
	92	parser.add_option(
	93	'--pause', metavar='SECONDS', type='float', default=2.0,
	94	help="pause between HTTP requests [default: 2.0]")
	95	parser.add_option(
	96	'--rua', metavar='USERAGENT', action='store_true', default=False,
	97	help="Randomize the User-Agent [default: no]")
	98	(options, args) = parser.parse_args()
	99	query = ' '.join(args)
	100	if not query:
	101	parser.print_help()
	102	sys.exit(2)
	103	params = [
	104	(k, v) for (k, v) in options.__dict__.items()
	105	if not k.startswith('_')]
	106	params = dict(params)
96	107
97		# Run the query.
98		for url in search(query, **params):
99		print(url)
100		try:
101		sys.stdout.flush()
102		except:
103		pass
	108	# Split the comma separated list of domains, if present.
	109	if 'domains' in params:
	110	params['domains'] = [x.strip() for x in params['domains'].split(',')]
	111
	112	# Use a special search type if requested.
	113	if 'tpe' in params:
	114	tpe = params['tpe']
	115	if tpe and tpe not in (
	116	'search', 'images', 'videos', 'news',
	117	'shopping', 'books', 'apps'):
	118	parser.error("invalid type: %r" % tpe)
	119	if tpe == 'search':
	120	params['tpe'] = ''
	121
	122	# Randomize the user agent if requested.
	123	if 'rua' in params and params.pop('rua'):
	124	params['user_agent'] = get_random_user_agent()
	125
	126	# Run the query.
	127	for url in search(query, **params):
	128	print(url)
	129	try:
	130	sys.stdout.flush()
	131	except Exception:
	132	pass
	133
	134
	135	if __name__ == '__main__':
	136	main()

-2

setup.py less more

43	43	scripts=[join('scripts', 'google')],
44	44	package_data={'googlesearch': ['user_agents.txt.gz']},
45	45	include_package_data=True,
46		version="2.0.2",
	46	version="2.0.3",
47	47	description="Python bindings to the Google search engine.",
48	48	author="Mario Vilas",
49	49	author_email="[email protected]",

65	65	metadata['install_requires'] = metadata['requires']
66	66	except ImportError:
67	67	from distutils.core import setup
68
	68	"""
69	69	# Get the long description from the readme file.
70	70	try:
71	71	metadata['long_description'] = open(join(here, 'README.md'), 'rU').read()
72	72	except Exception:
73	73	pass
74	74
	75	# If twine is installed, set the long description content type.
	76	try:
	77	import twine
	78	metadata['long_description_content_type'] = 'text/markdown'
	79	except ImportError:
	80	pass
	81	"""
75	82	# Run the setup script.
76	83	setup(**metadata)