Commit upstream/2.0.2 - python-googlesearch

New upstream version 2.0.2 Sophie Brun 4 years ago

14 changed file(s) with 1080 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all

-0

MANIFEST.in less more

	0	include README.md
	1	include MANIFEST.in
	2	include setup.py
	3	include scripts/google
	4	include requirements.txt
	5	include googlesearch/user_agents.txt.gz

+37

-0

PKG-INFO less more

	0	Metadata-Version: 1.1
	1	Name: google
	2	Version: 2.0.2
	3	Summary: Python bindings to the Google search engine.
	4	Home-page: http://breakingcode.wordpress.com/
	5	Author: Mario Vilas
	6	Author-email: [email protected]
	7	License: UNKNOWN
	8	Description: googlesearch
	9	============
	10
	11	Google search from Python.
	12
	13	https://python-googlesearch.readthedocs.io/en/latest/
	14
	15	Usage example
	16	-------------
	17
	18	# Get the first 20 hits for: "Breaking Code" WordPress blog
	19	from googlesearch import search
	20	for url in search('"Breaking Code" WordPress blog', stop=20):
	21	    print(url)
	22
	23	Installing
	24	----------
	25
	26	pip install google
	27
	28	Platform: UNKNOWN
	29	Classifier: Development Status :: 5 - Production/Stable
	30	Classifier: Intended Audience :: Developers
	31	Classifier: License :: OSI Approved :: BSD License
	32	Classifier: Environment :: Console
	33	Classifier: Programming Language :: Python
	34	Classifier: Topic :: Software Development :: Libraries :: Python Modules
	35	Requires: beautifulsoup4
	36	Provides: googlesearch

+19

-0

README.md less more

	0	googlesearch
	1	============
	2
	3	Google search from Python.
	4
	5	https://python-googlesearch.readthedocs.io/en/latest/
	6
	7	Usage example
	8	-------------
	9
	10	# Get the first 20 hits for: "Breaking Code" WordPress blog
	11	from googlesearch import search
	12	for url in search('"Breaking Code" WordPress blog', stop=20):
	13	    print(url)
	14
	15	Installing
	16	----------
	17
	18	pip install google

+37

-0

google.egg-info/PKG-INFO less more

	0	Metadata-Version: 1.1
	1	Name: google
	2	Version: 2.0.2
	3	Summary: Python bindings to the Google search engine.
	4	Home-page: http://breakingcode.wordpress.com/
	5	Author: Mario Vilas
	6	Author-email: [email protected]
	7	License: UNKNOWN
	8	Description: googlesearch
	9	============
	10
	11	Google search from Python.
	12
	13	https://python-googlesearch.readthedocs.io/en/latest/
	14
	15	Usage example
	16	-------------
	17
	18	# Get the first 20 hits for: "Breaking Code" WordPress blog
	19	from googlesearch import search
	20	for url in search('"Breaking Code" WordPress blog', stop=20):
	21	    print(url)
	22
	23	Installing
	24	----------
	25
	26	pip install google
	27
	28	Platform: UNKNOWN
	29	Classifier: Development Status :: 5 - Production/Stable
	30	Classifier: Intended Audience :: Developers
	31	Classifier: License :: OSI Approved :: BSD License
	32	Classifier: Environment :: Console
	33	Classifier: Programming Language :: Python
	34	Classifier: Topic :: Software Development :: Libraries :: Python Modules
	35	Requires: beautifulsoup4
	36	Provides: googlesearch

+13

-0

google.egg-info/SOURCES.txt less more

	0	MANIFEST.in
	1	README.md
	2	requirements.txt
	3	setup.cfg
	4	setup.py
	5	google.egg-info/PKG-INFO
	6	google.egg-info/SOURCES.txt
	7	google.egg-info/dependency_links.txt
	8	google.egg-info/requires.txt
	9	google.egg-info/top_level.txt
	10	googlesearch/__init__.py
	11	googlesearch/user_agents.txt.gz
	12	scripts/google⏎

-0

google.egg-info/dependency_links.txt less more

-0

google.egg-info/requires.txt less more

beautifulsoup4

-0

google.egg-info/top_level.txt less more

googlesearch

+776

-0

googlesearch/__init__.py less more

	0	#!/usr/bin/env python
	1
	2	# Python bindings to the Google search engine
	3	# Copyright (c) 2009-2018, Mario Vilas
	4	# All rights reserved.
	5	#
	6	# Redistribution and use in source and binary forms, with or without
	7	# modification, are permitted provided that the following conditions are met:
	8	#
	9	# * Redistributions of source code must retain the above copyright notice,
	10	# this list of conditions and the following disclaimer.
	11	# * Redistributions in binary form must reproduce the above copyright
	12	# notice,this list of conditions and the following disclaimer in the
	13	# documentation and/or other materials provided with the distribution.
	14	# * Neither the name of the copyright holder nor the names of its
	15	# contributors may be used to endorse or promote products derived from
	16	# this software without specific prior written permission.
	17	#
	18	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	19	# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	20	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	21	# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	22	# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	23	# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	24	# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	25	# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	26	# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	27	# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	28	# POSSIBILITY OF SUCH DAMAGE.
	29
	30	import os
	31	import random
	32	import sys
	33	import time
	34	import math
	35
	36	if sys.version_info[0] > 2:
	37	from http.cookiejar import LWPCookieJar
	38	from urllib.request import Request, urlopen
	39	from urllib.parse import quote_plus, urlparse, parse_qs
	40	else:
	41	from cookielib import LWPCookieJar
	42	from urllib import quote_plus
	43	from urllib2 import Request, urlopen
	44	from urlparse import urlparse, parse_qs
	45
	46	try:
	47	from bs4 import BeautifulSoup
	48	is_bs4 = True
	49	except ImportError:
	50	from BeautifulSoup import BeautifulSoup
	51	is_bs4 = False
	52
	53	__all__ = [
	54
	55	# Main search function.
	56	'search',
	57
	58	# Specialized search functions.
	59	'search_images', 'search_news',
	60	'search_videos', 'search_shop',
	61	'search_books', 'search_apps',
	62
	63	# Shortcut for "get lucky" search.
	64	'lucky',
	65
	66	# Computations based on the number of Google hits.
	67	'hits', 'ngd',
	68
	69	# Miscellaneous utility functions.
	70	'get_random_user_agent',
	71	]
	72
	73	# URL templates to make Google searches.
	74	url_home = "https://www.google.%(tld)s/"
	75	url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
	76	"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
	77	url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
	78	"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
	79	url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
	80	"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
	81	"tbm=%(tpe)s"
	82	url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
	83	"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
	84	"safe=%(safe)s&tbm=%(tpe)s"
	85
	86	# Cookie jar. Stored at the user's home folder.
	87	home_folder = os.getenv('HOME')
	88	if not home_folder:
	89	home_folder = os.getenv('USERHOME')
	90	if not home_folder:
	91	home_folder = '.' # Use the current folder on error.
	92	cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
	93	try:
	94	cookie_jar.load()
	95	except Exception:
	96	pass
	97
	98	# Default user agent, unless instructed by the user to change it.
	99	USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
	100
	101	# Load the list of valid user agents from the install folder.
	102	try:
	103	install_folder = os.path.abspath(os.path.split(__file__)[0])
	104	try:
	105	user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
	106	import gzip
	107	fp = gzip.open(user_agents_file, 'rb')
	108	try:
	109	user_agents_list = [_.strip() for _ in fp.readlines()]
	110	finally:
	111	fp.close()
	112	del fp
	113	except Exception:
	114	user_agents_file = os.path.join(install_folder, 'user_agents.txt')
	115	with open(user_agents_file) as fp:
	116	user_agents_list = [_.strip() for _ in fp.readlines()]
	117	except Exception:
	118	user_agents_list = [USER_AGENT]
	119
	120
	121	# Get a random user agent.
	122	def get_random_user_agent():
	123	"""
	124	Get a random user agent string.
	125
	126	:rtype: str
	127	:return: Random user agent string.
	128	"""
	129	return random.choice(user_agents_list)
	130
	131
	132	# Request the given URL and return the response page, using the cookie jar.
	133	def get_page(url, user_agent=None):
	134	"""
	135	Request the given URL and return the response page, using the cookie jar.
	136
	137	:param str url: URL to retrieve.
	138	:param str user_agent: User agent for the HTTP requests.
	139	Use None for the default.
	140
	141	:rtype: str
	142	:return: Web page retrieved for the given URL.
	143
	144	:raises IOError: An exception is raised on error.
	145	:raises urllib2.URLError: An exception is raised on error.
	146	:raises urllib2.HTTPError: An exception is raised on error.
	147	"""
	148	if user_agent is None:
	149	user_agent = USER_AGENT
	150	request = Request(url)
	151	request.add_header('User-Agent', user_agent)
	152	cookie_jar.add_cookie_header(request)
	153	response = urlopen(request)
	154	cookie_jar.extract_cookies(response, request)
	155	html = response.read()
	156	response.close()
	157	try:
	158	cookie_jar.save()
	159	except Exception:
	160	pass
	161	return html
	162
	163
	164	# Filter links found in the Google result pages HTML code.
	165	# Returns None if the link doesn't yield a valid result.
	166	def filter_result(link):
	167	try:
	168
	169	# Valid results are absolute URLs not pointing to a Google domain
	170	# like images.google.com or googleusercontent.com
	171	o = urlparse(link, 'http')
	172	if o.netloc and 'google' not in o.netloc:
	173	return link
	174
	175	# Decode hidden URLs.
	176	if link.startswith('/url?'):
	177	link = parse_qs(o.query)['q'][0]
	178
	179	# Valid results are absolute URLs not pointing to a Google domain
	180	# like images.google.com or googleusercontent.com
	181	o = urlparse(link, 'http')
	182	if o.netloc and 'google' not in o.netloc:
	183	return link
	184
	185	# Otherwise, or on error, return None.
	186	except Exception:
	187	pass
	188	return None
	189
	190
	191	# Returns a generator that yields URLs.
	192	def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
	193	stop=None, domains=None, pause=2.0, only_standard=False,
	194	extra_params={}, tpe='', user_agent=None):
	195	"""
	196	Search the given query string using Google.
	197
	198	:param str query: Query string. Must NOT be url-encoded.
	199	:param str tld: Top level domain.
	200	:param str lang: Language.
	201	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	202	"qdr:d" => last 24 hours, "qdr:m" => last month).
	203	:param str safe: Safe search.
	204	:param int num: Number of results per page.
	205	:param int start: First result to retrieve.
	206	:param int or None stop: Last result to retrieve.
	207	Use None to keep searching forever.
	208	:param list of str or None domains: A list of web domains to constrain
	209	the search.
	210	:param float pause: Lapse to wait between HTTP requests.
	211	A lapse too long will make the search slow, but a lapse too short may
	212	cause Google to block your IP. Your mileage may vary!
	213	:param bool only_standard: If True, only returns the standard results from
	214	each page. If False, it returns every possible link from each page,
	215	except for those that point back to Google itself. Defaults to False
	216	for backwards compatibility with older versions of this module.
	217	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	218	parameters, which must be URL encoded. For example if you don't want
	219	Google to filter similar results you can set the extra_params to
	220	{'filter': '0'} which will append '&filter=0' to every query.
	221	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	222	Use the following values {videos: 'vid', images: 'isch',
	223	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	224	:param str or None user_agent: User agent for the HTTP requests.
	225	Use None for the default.
	226
	227	:rtype: generator of str
	228	:return: Generator (iterator) that yields found URLs.
	229	If the stop parameter is None the iterator will loop forever.
	230	"""
	231	# Set of hashes for the results found.
	232	# This is used to avoid repeated results.
	233	hashes = set()
	234
	235	# Count the number of links yielded
	236	count = 0
	237
	238	# Prepare domain list if it exists.
	239	if domains:
	240	query = query + ' ' + ' OR '.join(
	241	'site:' + domain for domain in domains)
	242
	243	# Prepare the search string.
	244	query = quote_plus(query)
	245
	246	# Check extra_params for overlapping
	247	for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
	248	if builtin_param in extra_params.keys():
	249	raise ValueError(
	250	'GET parameter "%s" is overlapping with \
	251	the built-in GET parameter',
	252	builtin_param
	253	)
	254
	255	# Grab the cookie from the home page.
	256	get_page(url_home % vars(), user_agent)
	257
	258	# Prepare the URL of the first request.
	259	if start:
	260	if num == 10:
	261	url = url_next_page % vars()
	262	else:
	263	url = url_next_page_num % vars()
	264	else:
	265	if num == 10:
	266	url = url_search % vars()
	267	else:
	268	url = url_search_num % vars()
	269
	270	# Loop until we reach the maximum result, if any (otherwise, loop forever).
	271	while not stop or count < stop:
	272	# Remeber last count to detect the end of results
	273	last_count = count
	274
	275	try: # Is it python<3?
	276	iter_extra_params = extra_params.iteritems()
	277	except AttributeError: # Or python>3?
	278	iter_extra_params = extra_params.items()
	279	# Append extra GET_parameters to URL
	280	for k, v in iter_extra_params:
	281	url += url + ('&%s=%s' % (k, v))
	282
	283	# Sleep between requests.
	284	time.sleep(pause)
	285
	286	# Request the Google Search results page.
	287	html = get_page(url, user_agent)
	288
	289	# Parse the response and process every anchored URL.
	290	if is_bs4:
	291	soup = BeautifulSoup(html, 'html.parser')
	292	else:
	293	soup = BeautifulSoup(html)
	294	try:
	295	anchors = soup.find(id='search').findAll('a')
	296	# Sometimes (depending on the User-agent) there is
	297	# no id "search" in html response
	298	except AttributeError:
	299	# Remove links of the top bar
	300	gbar = soup.find(id='gbar')
	301	if gbar:
	302	gbar.clear()
	303	anchors = soup.findAll('a')
	304	for a in anchors:
	305
	306	# Leave only the "standard" results if requested.
	307	# Otherwise grab all possible links.
	308	if only_standard and (
	309	not a.parent or a.parent.name.lower() != "h3"):
	310	continue
	311
	312	# Get the URL from the anchor tag.
	313	try:
	314	link = a['href']
	315	except KeyError:
	316	continue
	317
	318	# Filter invalid links and links pointing to Google itself.
	319	link = filter_result(link)
	320	if not link:
	321	continue
	322
	323	# Discard repeated results.
	324	h = hash(link)
	325	if h in hashes:
	326	continue
	327	hashes.add(h)
	328
	329	# Yield the result.
	330	yield link
	331
	332	count += 1
	333	if stop and count >= stop:
	334	return
	335
	336	# End if there are no more results.
	337	if last_count == count:
	338	break
	339
	340	# Prepare the URL for the next request.
	341	start += num
	342	if num == 10:
	343	url = url_next_page % vars()
	344	else:
	345	url = url_next_page_num % vars()
	346
	347
	348	# Shortcut to search images.
	349	# Beware, this does not return the image link.
	350	def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
	351	start=0, stop=None, pause=2.0, domains=None,
	352	only_standard=False, extra_params={}):
	353	"""
	354	Shortcut to search images.
	355
	356	:note: Beware, this does not return the image link.
	357
	358	:param str query: Query string. Must NOT be url-encoded.
	359	:param str tld: Top level domain.
	360	:param str lang: Language.
	361	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	362	"qdr:d" => last 24 hours, "qdr:m" => last month).
	363	:param str safe: Safe search.
	364	:param int num: Number of results per page.
	365	:param int start: First result to retrieve.
	366	:param int or None stop: Last result to retrieve.
	367	Use None to keep searching forever.
	368	:param list of str or None domains: A list of web domains to constrain
	369	the search.
	370	:param float pause: Lapse to wait between HTTP requests.
	371	A lapse too long will make the search slow, but a lapse too short may
	372	cause Google to block your IP. Your mileage may vary!
	373	:param bool only_standard: If True, only returns the standard results from
	374	each page. If False, it returns every possible link from each page,
	375	except for those that point back to Google itself. Defaults to False
	376	for backwards compatibility with older versions of this module.
	377	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	378	parameters, which must be URL encoded. For example if you don't want
	379	Google to filter similar results you can set the extra_params to
	380	{'filter': '0'} which will append '&filter=0' to every query.
	381	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	382	Use the following values {videos: 'vid', images: 'isch',
	383	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	384	:param str or None user_agent: User agent for the HTTP requests.
	385	Use None for the default.
	386
	387	:rtype: generator of str
	388	:return: Generator (iterator) that yields found URLs.
	389	If the stop parameter is None the iterator will loop forever.
	390	"""
	391	return search(query, tld, lang, tbs, safe, num, start, stop, domains,
	392	pause, only_standard, extra_params, tpe='isch')
	393
	394
	395	# Shortcut to search news.
	396	def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
	397	start=0, stop=None, domains=None, pause=2.0,
	398	only_standard=False, extra_params={}):
	399	"""
	400	Shortcut to search news.
	401
	402	:param str query: Query string. Must NOT be url-encoded.
	403	:param str tld: Top level domain.
	404	:param str lang: Language.
	405	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	406	"qdr:d" => last 24 hours, "qdr:m" => last month).
	407	:param str safe: Safe search.
	408	:param int num: Number of results per page.
	409	:param int start: First result to retrieve.
	410	:param int or None stop: Last result to retrieve.
	411	Use None to keep searching forever.
	412	:param list of str or None domains: A list of web domains to constrain
	413	the search.
	414	:param float pause: Lapse to wait between HTTP requests.
	415	A lapse too long will make the search slow, but a lapse too short may
	416	cause Google to block your IP. Your mileage may vary!
	417	:param bool only_standard: If True, only returns the standard results from
	418	each page. If False, it returns every possible link from each page,
	419	except for those that point back to Google itself. Defaults to False
	420	for backwards compatibility with older versions of this module.
	421	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	422	parameters, which must be URL encoded. For example if you don't want
	423	Google to filter similar results you can set the extra_params to
	424	{'filter': '0'} which will append '&filter=0' to every query.
	425	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	426	Use the following values {videos: 'vid', images: 'isch',
	427	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	428	:param str or None user_agent: User agent for the HTTP requests.
	429	Use None for the default.
	430
	431	:rtype: generator of str
	432	:return: Generator (iterator) that yields found URLs.
	433	If the stop parameter is None the iterator will loop forever.
	434	"""
	435	return search(query, tld, lang, tbs, safe, num, start, stop, domains,
	436	pause, only_standard, extra_params, tpe='nws')
	437
	438
	439	# Shortcut to search videos.
	440	def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
	441	start=0, stop=None, domains=None, pause=2.0,
	442	only_standard=False, extra_params={}):
	443	"""
	444	Shortcut to search videos.
	445
	446	:param str query: Query string. Must NOT be url-encoded.
	447	:param str tld: Top level domain.
	448	:param str lang: Language.
	449	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	450	"qdr:d" => last 24 hours, "qdr:m" => last month).
	451	:param str safe: Safe search.
	452	:param int num: Number of results per page.
	453	:param int start: First result to retrieve.
	454	:param int or None stop: Last result to retrieve.
	455	Use None to keep searching forever.
	456	:param list of str or None domains: A list of web domains to constrain
	457	the search.
	458	:param float pause: Lapse to wait between HTTP requests.
	459	A lapse too long will make the search slow, but a lapse too short may
	460	cause Google to block your IP. Your mileage may vary!
	461	:param bool only_standard: If True, only returns the standard results from
	462	each page. If False, it returns every possible link from each page,
	463	except for those that point back to Google itself. Defaults to False
	464	for backwards compatibility with older versions of this module.
	465	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	466	parameters, which must be URL encoded. For example if you don't want
	467	Google to filter similar results you can set the extra_params to
	468	{'filter': '0'} which will append '&filter=0' to every query.
	469	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	470	Use the following values {videos: 'vid', images: 'isch',
	471	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	472	:param str or None user_agent: User agent for the HTTP requests.
	473	Use None for the default.
	474
	475	:rtype: generator of str
	476	:return: Generator (iterator) that yields found URLs.
	477	If the stop parameter is None the iterator will loop forever.
	478	"""
	479	return search(query, tld, lang, tbs, safe, num, start, stop, domains,
	480	pause, only_standard, extra_params, tpe='vid')
	481
	482
	483	# Shortcut to search shop.
	484	def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
	485	start=0, stop=None, domains=None, pause=2.0,
	486	only_standard=False, extra_params={}):
	487	"""
	488	Shortcut to search shop.
	489
	490	:param str query: Query string. Must NOT be url-encoded.
	491	:param str tld: Top level domain.
	492	:param str lang: Language.
	493	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	494	"qdr:d" => last 24 hours, "qdr:m" => last month).
	495	:param str safe: Safe search.
	496	:param int num: Number of results per page.
	497	:param int start: First result to retrieve.
	498	:param int or None stop: Last result to retrieve.
	499	Use None to keep searching forever.
	500	:param list of str or None domains: A list of web domains to constrain
	501	the search.
	502	:param float pause: Lapse to wait between HTTP requests.
	503	A lapse too long will make the search slow, but a lapse too short may
	504	cause Google to block your IP. Your mileage may vary!
	505	:param bool only_standard: If True, only returns the standard results from
	506	each page. If False, it returns every possible link from each page,
	507	except for those that point back to Google itself. Defaults to False
	508	for backwards compatibility with older versions of this module.
	509	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	510	parameters, which must be URL encoded. For example if you don't want
	511	Google to filter similar results you can set the extra_params to
	512	{'filter': '0'} which will append '&filter=0' to every query.
	513	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	514	Use the following values {videos: 'vid', images: 'isch',
	515	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	516	:param str or None user_agent: User agent for the HTTP requests.
	517	Use None for the default.
	518
	519	:rtype: generator of str
	520	:return: Generator (iterator) that yields found URLs.
	521	If the stop parameter is None the iterator will loop forever.
	522	"""
	523	return search(query, tld, lang, tbs, safe, num, start, stop, domains,
	524	pause, only_standard, extra_params, tpe='shop')
	525
	526
	527	# Shortcut to search books.
	528	def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
	529	start=0, stop=None, domains=None, pause=2.0,
	530	only_standard=False, extra_params={}):
	531	"""
	532	Shortcut to search books.
	533
	534	:param str query: Query string. Must NOT be url-encoded.
	535	:param str tld: Top level domain.
	536	:param str lang: Language.
	537	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	538	"qdr:d" => last 24 hours, "qdr:m" => last month).
	539	:param str safe: Safe search.
	540	:param int num: Number of results per page.
	541	:param int start: First result to retrieve.
	542	:param int or None stop: Last result to retrieve.
	543	Use None to keep searching forever.
	544	:param list of str or None domains: A list of web domains to constrain
	545	the search.
	546	:param float pause: Lapse to wait between HTTP requests.
	547	A lapse too long will make the search slow, but a lapse too short may
	548	cause Google to block your IP. Your mileage may vary!
	549	:param bool only_standard: If True, only returns the standard results from
	550	each page. If False, it returns every possible link from each page,
	551	except for those that point back to Google itself. Defaults to False
	552	for backwards compatibility with older versions of this module.
	553	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	554	parameters, which must be URL encoded. For example if you don't want
	555	Google to filter similar results you can set the extra_params to
	556	{'filter': '0'} which will append '&filter=0' to every query.
	557	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	558	Use the following values {videos: 'vid', images: 'isch',
	559	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	560	:param str or None user_agent: User agent for the HTTP requests.
	561	Use None for the default.
	562
	563	:rtype: generator of str
	564	:return: Generator (iterator) that yields found URLs.
	565	If the stop parameter is None the iterator will loop forever.
	566	"""
	567	return search(query, tld, lang, tbs, safe, num, start, stop, domains,
	568	pause, only_standard, extra_params, tpe='bks')
	569
	570
	571	# Shortcut to search apps.
	572	def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
	573	start=0, stop=None, domains=None, pause=2.0,
	574	only_standard=False, extra_params={}):
	575	"""
	576	Shortcut to search apps.
	577
	578	:param str query: Query string. Must NOT be url-encoded.
	579	:param str tld: Top level domain.
	580	:param str lang: Language.
	581	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	582	"qdr:d" => last 24 hours, "qdr:m" => last month).
	583	:param str safe: Safe search.
	584	:param int num: Number of results per page.
	585	:param int start: First result to retrieve.
	586	:param int or None stop: Last result to retrieve.
	587	Use None to keep searching forever.
	588	:param list of str or None domains: A list of web domains to constrain
	589	the search.
	590	:param float pause: Lapse to wait between HTTP requests.
	591	A lapse too long will make the search slow, but a lapse too short may
	592	cause Google to block your IP. Your mileage may vary!
	593	:param bool only_standard: If True, only returns the standard results from
	594	each page. If False, it returns every possible link from each page,
	595	except for those that point back to Google itself. Defaults to False
	596	for backwards compatibility with older versions of this module.
	597	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	598	parameters, which must be URL encoded. For example if you don't want
	599	Google to filter similar results you can set the extra_params to
	600	{'filter': '0'} which will append '&filter=0' to every query.
	601	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	602	Use the following values {videos: 'vid', images: 'isch',
	603	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	604	:param str or None user_agent: User agent for the HTTP requests.
	605	Use None for the default.
	606
	607	:rtype: generator of str
	608	:return: Generator (iterator) that yields found URLs.
	609	If the stop parameter is None the iterator will loop forever.
	610	"""
	611	return search(query, tld, lang, tbs, safe, num, start, stop, domains,
	612	pause, only_standard, extra_params, tpe='app')
	613
	614
	615	# Shortcut to single-item search.
	616	# Evaluates the iterator to return the single URL as a string.
	617	def lucky(query, tld='com', lang='en', tbs='0', safe='off',
	618	only_standard=False, extra_params={}, tpe=''):
	619	"""
	620	Shortcut to single-item search.
	621
	622	:param str query: Query string. Must NOT be url-encoded.
	623	:param str tld: Top level domain.
	624	:param str lang: Language.
	625	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	626	"qdr:d" => last 24 hours, "qdr:m" => last month).
	627	:param str safe: Safe search.
	628	:param int num: Number of results per page.
	629	:param int start: First result to retrieve.
	630	:param int or None stop: Last result to retrieve.
	631	Use None to keep searching forever.
	632	:param list of str or None domains: A list of web domains to constrain
	633	the search.
	634	:param float pause: Lapse to wait between HTTP requests.
	635	A lapse too long will make the search slow, but a lapse too short may
	636	cause Google to block your IP. Your mileage may vary!
	637	:param bool only_standard: If True, only returns the standard results from
	638	each page. If False, it returns every possible link from each page,
	639	except for those that point back to Google itself. Defaults to False
	640	for backwards compatibility with older versions of this module.
	641	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	642	parameters, which must be URL encoded. For example if you don't want
	643	Google to filter similar results you can set the extra_params to
	644	{'filter': '0'} which will append '&filter=0' to every query.
	645	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	646	Use the following values {videos: 'vid', images: 'isch',
	647	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	648	:param str or None user_agent: User agent for the HTTP requests.
	649	Use None for the default.
	650
	651	:rtype: str
	652	:return: URL found by Google.
	653	"""
	654	gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
	655	extra_params, tpe)
	656	return next(gen)
	657
	658
	659	# Returns only the number of Google hits for the given search query.
	660	# This is the number reported by Google itself, NOT by scraping.
	661	def hits(query, tld='com', lang='en', tbs='0', safe='off',
	662	domains=None, extra_params={}, tpe='', user_agent=None):
	663	"""
	664	Search the given query string using Google and return the number of hits.
	665
	666	:note: This is the number reported by Google itself, NOT by scraping.
	667
	668	:param str query: Query string. Must NOT be url-encoded.
	669	:param str tld: Top level domain.
	670	:param str lang: Language.
	671	:param str tbs: Time limits (i.e "qdr:h" => last hour,
	672	"qdr:d" => last 24 hours, "qdr:m" => last month).
	673	:param str safe: Safe search.
	674	:param int num: Number of results per page.
	675	:param int start: First result to retrieve.
	676	:param int or None stop: Last result to retrieve.
	677	Use None to keep searching forever.
	678	:param list of str or None domains: A list of web domains to constrain
	679	the search.
	680	:param float pause: Lapse to wait between HTTP requests.
	681	A lapse too long will make the search slow, but a lapse too short may
	682	cause Google to block your IP. Your mileage may vary!
	683	:param bool only_standard: If True, only returns the standard results from
	684	each page. If False, it returns every possible link from each page,
	685	except for those that point back to Google itself. Defaults to False
	686	for backwards compatibility with older versions of this module.
	687	:param dict of str to str extra_params: A dictionary of extra HTTP GET
	688	parameters, which must be URL encoded. For example if you don't want
	689	Google to filter similar results you can set the extra_params to
	690	{'filter': '0'} which will append '&filter=0' to every query.
	691	:param str tpe: Search type (images, videos, news, shopping, books, apps)
	692	Use the following values {videos: 'vid', images: 'isch',
	693	news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
	694	:param str or None user_agent: User agent for the HTTP requests.
	695	Use None for the default.
	696
	697	:rtype: int
	698	:return: Number of Google hits for the given search query.
	699	"""
	700
	701	# Prepare domain list if it exists.
	702	if domains:
	703	domain_query = '+OR+'.join('site:' + domain for domain in domains)
	704	domain_query = '+' + domain_query
	705	else:
	706	domain_query = ''
	707
	708	# Prepare the search string.
	709	query = quote_plus(query + domain_query)
	710
	711	# Check extra_params for overlapping
	712	for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
	713	if builtin_param in extra_params.keys():
	714	raise ValueError(
	715	'GET parameter "%s" is overlapping with \
	716	the built-in GET parameter',
	717	builtin_param
	718	)
	719
	720	# Grab the cookie from the home page.
	721	get_page(url_home % vars(), user_agent)
	722
	723	# Prepare the URL of the first (and in this cases ONLY) request.
	724	url = url_search % vars()
	725
	726	try: # Is it python<3?
	727	iter_extra_params = extra_params.iteritems()
	728	except AttributeError: # Or python>3?
	729	iter_extra_params = extra_params.items()
	730	# Append extra GET_parameters to URL
	731	for k, v in iter_extra_params:
	732	url += url + ('&%s=%s' % (k, v))
	733
	734	# Request the Google Search results page.
	735	html = get_page(url, user_agent)
	736
	737	# Parse the response.
	738	if is_bs4:
	739	soup = BeautifulSoup(html, 'html.parser')
	740	else:
	741	soup = BeautifulSoup(html)
	742
	743	# Get the number of hits.
	744	tag = soup.find_all(attrs={"class": "sd", "id": "resultStats"})[0]
	745	hits_text_parts = tag.text.split()
	746	if len(hits_text_parts) < 3:
	747	return 0
	748	return int(hits_text_parts[1].replace(',', '').replace('.', ''))
	749
	750
	751	def ngd(term1, term2):
	752	"""
	753	Return the Normalized Google distance between words.
	754
	755	For more info, refer to:
	756	https://en.wikipedia.org/wiki/Normalized_Google_distance
	757
	758	:param str term1: First term to compare.
	759	:param str term2: Second term to compare.
	760
	761	:rtype: float
	762	:return: Normalized Google distance between words.
	763	"""
	764
	765	lhits1 = math.log10(hits(term1))
	766	lhits2 = math.log10(hits(term2))
	767	lhits_mix = math.log10(hits('"' + term1 + '" "' + term2 + '"'))
	768	npages = hits('the')
	769	fix = 1000
	770
	771	lN = math.log10(npages * fix)
	772	numerator = max([lhits1, lhits2]) - lhits_mix
	773	denomin = lN - min([lhits1, lhits2])
	774
	775	return numerator / denomin

googlesearch/user_agents.txt.gz less more

Binary diff not shown

-0

requirements.txt less more

beautifulsoup4>=4.0

+104

-0

scripts/google less more

	0	#!/usr/bin/env python
	1
	2	# Python bindings to the Google search engine
	3	# Copyright (c) 2009-2016, Mario Vilas
	4	# All rights reserved.
	5	#
	6	# Redistribution and use in source and binary forms, with or without
	7	# modification, are permitted provided that the following conditions are met:
	8	#
	9	# * Redistributions of source code must retain the above copyright notice,
	10	# this list of conditions and the following disclaimer.
	11	# * Redistributions in binary form must reproduce the above copyright
	12	# notice,this list of conditions and the following disclaimer in the
	13	# documentation and/or other materials provided with the distribution.
	14	# * Neither the name of the copyright holder nor the names of its
	15	# contributors may be used to endorse or promote products derived from
	16	# this software without specific prior written permission.
	17	#
	18	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	19	# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	20	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	21	# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	22	# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	23	# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	24	# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	25	# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	26	# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	27	# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	28	# POSSIBILITY OF SUCH DAMAGE.
	29
	30	import sys
	31
	32	from googlesearch import search, get_random_user_agent
	33
	34	from optparse import OptionParser, IndentedHelpFormatter
	35
	36	class BannerHelpFormatter(IndentedHelpFormatter):
	37
	38	"Just a small tweak to optparse to be able to print a banner."
	39
	40	def __init__(self, banner, argv, *argd):
	41	self.banner = banner
	42	IndentedHelpFormatter.__init__(self, argv, *argd)
	43
	44	def format_usage(self, usage):
	45	msg = IndentedHelpFormatter.format_usage(self, usage)
	46	return '%s\n%s' % (self.banner, msg)
	47
	48	# Parse the command line arguments.
	49	formatter = BannerHelpFormatter(
	50	"Python script to use the Google search engine\n"
	51	"By Mario Vilas (mvilas at gmail dot com)\n"
	52	"https://github.com/MarioVilas/googlesearch\n"
	53	)
	54	parser = OptionParser(formatter=formatter)
	55	parser.set_usage("%prog [options] query")
	56	parser.add_option("--tld", metavar="TLD", type="string", default="com",
	57	help="top level domain to use [default: com]")
	58	parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
	59	help="produce results in the given language [default: en]")
	60	parser.add_option("--domains", metavar="DOMAINS", type="string", default="",
	61	help="comma separated list of domains to constrain the search to")
	62	parser.add_option("--tbs", metavar="TBS", type="string", default="0",
	63	help="produce results from period [default: 0]")
	64	parser.add_option("--safe", metavar="SAFE", type="string", default="off",
	65	help="kids safe search [default: off]")
	66	parser.add_option("--num", metavar="NUMBER", type="int", default=10,
	67	help="number of results per page [default: 10]")
	68	parser.add_option("--start", metavar="NUMBER", type="int", default=0,
	69	help="first result to retrieve [default: 0]")
	70	parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
	71	help="last result to retrieve [default: unlimited]")
	72	parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
	73	help="pause between HTTP requests [default: 2.0]")
	74	parser.add_option("--rua", metavar="USERAGENT", action="store_true", default=False,
	75	help="Randomize the User-Agent [default: no]")
	76	parser.add_option("--all", dest="only_standard",
	77	action="store_false", default=True,
	78	help="grab all possible links from result pages [default: only standard results]")
	79	(options, args) = parser.parse_args()
	80	query = ' '.join(args)
	81	if not query:
	82	parser.print_help()
	83	sys.exit(2)
	84	params = [(k, v) for (k, v) in options.__dict__.items() if not k.startswith('_')]
	85	params = dict(params)
	86
	87	# Split the comma separated list of domains, if present.
	88	if 'domains' in params:
	89	params['domains'] = [x.strip() for x in params['domains'].split(',')]
	90
	91	# Randomize the user agent if requested.
	92	if "rua" in params:
	93	rua = params.pop("rua")
	94	if rua:
	95	params["user_agent"] = get_random_user_agent()
	96
	97	# Run the query.
	98	for url in search(query, **params):
	99	print(url)
	100	try:
	101	sys.stdout.flush()
	102	except:
	103	pass

-0

setup.cfg less more

	0	[bdist_wheel]
	1	universal = 1
	2
	3	[egg_info]
	4	tag_build =
	5	tag_date = 0
	6

+77

-0

setup.py less more

	0	#!/usr/bin/env python
	1
	2	# Copyright (c) 2009-2019, Mario Vilas
	3	# All rights reserved.
	4	#
	5	# Redistribution and use in source and binary forms, with or without
	6	# modification, are permitted provided that the following conditions are met:
	7	#
	8	# * Redistributions of source code must retain the above copyright notice,
	9	# this list of conditions and the following disclaimer.
	10	# * Redistributions in binary form must reproduce the above copyright
	11	# notice,this list of conditions and the following disclaimer in the
	12	# documentation and/or other materials provided with the distribution.
	13	# * Neither the name of the copyright holder nor the names of its
	14	# contributors may be used to endorse or promote products derived from
	15	# this software without specific prior written permission.
	16	#
	17	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	18	# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	19	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	20	# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	21	# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	22	# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	23	# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	24	# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	25	# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	26	# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	27	# POSSIBILITY OF SUCH DAMAGE.
	28
	29	from os import chdir
	30	from os.path import abspath, join, split
	31
	32	# Make sure we are standing in the correct directory.
	33	# Old versions of distutils didn't take care of this.
	34	here = split(abspath(__file__))[0]
	35	chdir(here)
	36
	37	# Package metadata.
	38	metadata = dict(
	39	name='google',
	40	provides=['googlesearch'],
	41	requires=['beautifulsoup4'],
	42	packages=['googlesearch'],
	43	scripts=[join('scripts', 'google')],
	44	package_data={'googlesearch': ['user_agents.txt.gz']},
	45	include_package_data=True,
	46	version="2.0.2",
	47	description="Python bindings to the Google search engine.",
	48	author="Mario Vilas",
	49	author_email="[email protected]",
	50	url="http://breakingcode.wordpress.com/",
	51	classifiers=[
	52	"Development Status :: 5 - Production/Stable",
	53	"Intended Audience :: Developers",
	54	"License :: OSI Approved :: BSD License",
	55	"Environment :: Console",
	56	"Programming Language :: Python",
	57	"Topic :: Software Development :: Libraries :: Python Modules",
	58	],
	59	)
	60
	61	# Prefer setuptools over the old distutils.
	62	# If setuptools is available, use install_requires.
	63	try:
	64	from setuptools import setup
	65	metadata['install_requires'] = metadata['requires']
	66	except ImportError:
	67	from distutils.core import setup
	68
	69	# Get the long description from the readme file.
	70	try:
	71	metadata['long_description'] = open(join(here, 'README.md'), 'rU').read()
	72	except Exception:
	73	pass
	74
	75	# Run the setup script.
	76	setup(**metadata)