Codebase list python-googlesearch / upstream/2.0.2
New upstream version 2.0.2 Sophie Brun 4 years ago
14 changed file(s) with 1080 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 include README.md
1 include MANIFEST.in
2 include setup.py
3 include scripts/google
4 include requirements.txt
5 include googlesearch/user_agents.txt.gz
0 Metadata-Version: 1.1
1 Name: google
2 Version: 2.0.2
3 Summary: Python bindings to the Google search engine.
4 Home-page: http://breakingcode.wordpress.com/
5 Author: Mario Vilas
6 Author-email: [email protected]
7 License: UNKNOWN
8 Description: googlesearch
9 ============
10
11 Google search from Python.
12
13 https://python-googlesearch.readthedocs.io/en/latest/
14
15 Usage example
16 -------------
17
18 # Get the first 20 hits for: "Breaking Code" WordPress blog
19 from googlesearch import search
20 for url in search('"Breaking Code" WordPress blog', stop=20):
21 print(url)
22
23 Installing
24 ----------
25
26 pip install google
27
28 Platform: UNKNOWN
29 Classifier: Development Status :: 5 - Production/Stable
30 Classifier: Intended Audience :: Developers
31 Classifier: License :: OSI Approved :: BSD License
32 Classifier: Environment :: Console
33 Classifier: Programming Language :: Python
34 Classifier: Topic :: Software Development :: Libraries :: Python Modules
35 Requires: beautifulsoup4
36 Provides: googlesearch
0 googlesearch
1 ============
2
3 Google search from Python.
4
5 https://python-googlesearch.readthedocs.io/en/latest/
6
7 Usage example
8 -------------
9
10 # Get the first 20 hits for: "Breaking Code" WordPress blog
11 from googlesearch import search
12 for url in search('"Breaking Code" WordPress blog', stop=20):
13 print(url)
14
15 Installing
16 ----------
17
18 pip install google
0 Metadata-Version: 1.1
1 Name: google
2 Version: 2.0.2
3 Summary: Python bindings to the Google search engine.
4 Home-page: http://breakingcode.wordpress.com/
5 Author: Mario Vilas
6 Author-email: [email protected]
7 License: UNKNOWN
8 Description: googlesearch
9 ============
10
11 Google search from Python.
12
13 https://python-googlesearch.readthedocs.io/en/latest/
14
15 Usage example
16 -------------
17
18 # Get the first 20 hits for: "Breaking Code" WordPress blog
19 from googlesearch import search
20 for url in search('"Breaking Code" WordPress blog', stop=20):
21 print(url)
22
23 Installing
24 ----------
25
26 pip install google
27
28 Platform: UNKNOWN
29 Classifier: Development Status :: 5 - Production/Stable
30 Classifier: Intended Audience :: Developers
31 Classifier: License :: OSI Approved :: BSD License
32 Classifier: Environment :: Console
33 Classifier: Programming Language :: Python
34 Classifier: Topic :: Software Development :: Libraries :: Python Modules
35 Requires: beautifulsoup4
36 Provides: googlesearch
0 MANIFEST.in
1 README.md
2 requirements.txt
3 setup.cfg
4 setup.py
5 google.egg-info/PKG-INFO
6 google.egg-info/SOURCES.txt
7 google.egg-info/dependency_links.txt
8 google.egg-info/requires.txt
9 google.egg-info/top_level.txt
10 googlesearch/__init__.py
11 googlesearch/user_agents.txt.gz
12 scripts/google
0 #!/usr/bin/env python
1
2 # Python bindings to the Google search engine
3 # Copyright (c) 2009-2018, Mario Vilas
4 # All rights reserved.
5 #
6 # Redistribution and use in source and binary forms, with or without
7 # modification, are permitted provided that the following conditions are met:
8 #
9 # * Redistributions of source code must retain the above copyright notice,
10 # this list of conditions and the following disclaimer.
11 # * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
13 # documentation and/or other materials provided with the distribution.
14 # * Neither the name of the copyright holder nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 # POSSIBILITY OF SUCH DAMAGE.
29
30 import os
31 import random
32 import sys
33 import time
34 import math
35
36 if sys.version_info[0] > 2:
37 from http.cookiejar import LWPCookieJar
38 from urllib.request import Request, urlopen
39 from urllib.parse import quote_plus, urlparse, parse_qs
40 else:
41 from cookielib import LWPCookieJar
42 from urllib import quote_plus
43 from urllib2 import Request, urlopen
44 from urlparse import urlparse, parse_qs
45
46 try:
47 from bs4 import BeautifulSoup
48 is_bs4 = True
49 except ImportError:
50 from BeautifulSoup import BeautifulSoup
51 is_bs4 = False
52
53 __all__ = [
54
55 # Main search function.
56 'search',
57
58 # Specialized search functions.
59 'search_images', 'search_news',
60 'search_videos', 'search_shop',
61 'search_books', 'search_apps',
62
63 # Shortcut for "get lucky" search.
64 'lucky',
65
66 # Computations based on the number of Google hits.
67 'hits', 'ngd',
68
69 # Miscellaneous utility functions.
70 'get_random_user_agent',
71 ]
72
73 # URL templates to make Google searches.
74 url_home = "https://www.google.%(tld)s/"
75 url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
76 "btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
77 url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
78 "start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s"
79 url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
80 "num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
81 "tbm=%(tpe)s"
82 url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
83 "q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
84 "safe=%(safe)s&tbm=%(tpe)s"
85
86 # Cookie jar. Stored at the user's home folder.
87 home_folder = os.getenv('HOME')
88 if not home_folder:
89 home_folder = os.getenv('USERHOME')
90 if not home_folder:
91 home_folder = '.' # Use the current folder on error.
92 cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
93 try:
94 cookie_jar.load()
95 except Exception:
96 pass
97
98 # Default user agent, unless instructed by the user to change it.
99 USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'
100
101 # Load the list of valid user agents from the install folder.
102 try:
103 install_folder = os.path.abspath(os.path.split(__file__)[0])
104 try:
105 user_agents_file = os.path.join(install_folder, 'user_agents.txt.gz')
106 import gzip
107 fp = gzip.open(user_agents_file, 'rb')
108 try:
109 user_agents_list = [_.strip() for _ in fp.readlines()]
110 finally:
111 fp.close()
112 del fp
113 except Exception:
114 user_agents_file = os.path.join(install_folder, 'user_agents.txt')
115 with open(user_agents_file) as fp:
116 user_agents_list = [_.strip() for _ in fp.readlines()]
117 except Exception:
118 user_agents_list = [USER_AGENT]
119
120
121 # Get a random user agent.
122 def get_random_user_agent():
123 """
124 Get a random user agent string.
125
126 :rtype: str
127 :return: Random user agent string.
128 """
129 return random.choice(user_agents_list)
130
131
132 # Request the given URL and return the response page, using the cookie jar.
133 def get_page(url, user_agent=None):
134 """
135 Request the given URL and return the response page, using the cookie jar.
136
137 :param str url: URL to retrieve.
138 :param str user_agent: User agent for the HTTP requests.
139 Use None for the default.
140
141 :rtype: str
142 :return: Web page retrieved for the given URL.
143
144 :raises IOError: An exception is raised on error.
145 :raises urllib2.URLError: An exception is raised on error.
146 :raises urllib2.HTTPError: An exception is raised on error.
147 """
148 if user_agent is None:
149 user_agent = USER_AGENT
150 request = Request(url)
151 request.add_header('User-Agent', user_agent)
152 cookie_jar.add_cookie_header(request)
153 response = urlopen(request)
154 cookie_jar.extract_cookies(response, request)
155 html = response.read()
156 response.close()
157 try:
158 cookie_jar.save()
159 except Exception:
160 pass
161 return html
162
163
164 # Filter links found in the Google result pages HTML code.
165 # Returns None if the link doesn't yield a valid result.
166 def filter_result(link):
167 try:
168
169 # Valid results are absolute URLs not pointing to a Google domain
170 # like images.google.com or googleusercontent.com
171 o = urlparse(link, 'http')
172 if o.netloc and 'google' not in o.netloc:
173 return link
174
175 # Decode hidden URLs.
176 if link.startswith('/url?'):
177 link = parse_qs(o.query)['q'][0]
178
179 # Valid results are absolute URLs not pointing to a Google domain
180 # like images.google.com or googleusercontent.com
181 o = urlparse(link, 'http')
182 if o.netloc and 'google' not in o.netloc:
183 return link
184
185 # Otherwise, or on error, return None.
186 except Exception:
187 pass
188 return None
189
190
191 # Returns a generator that yields URLs.
192 def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
193 stop=None, domains=None, pause=2.0, only_standard=False,
194 extra_params={}, tpe='', user_agent=None):
195 """
196 Search the given query string using Google.
197
198 :param str query: Query string. Must NOT be url-encoded.
199 :param str tld: Top level domain.
200 :param str lang: Language.
201 :param str tbs: Time limits (i.e "qdr:h" => last hour,
202 "qdr:d" => last 24 hours, "qdr:m" => last month).
203 :param str safe: Safe search.
204 :param int num: Number of results per page.
205 :param int start: First result to retrieve.
206 :param int or None stop: Last result to retrieve.
207 Use None to keep searching forever.
208 :param list of str or None domains: A list of web domains to constrain
209 the search.
210 :param float pause: Lapse to wait between HTTP requests.
211 A lapse too long will make the search slow, but a lapse too short may
212 cause Google to block your IP. Your mileage may vary!
213 :param bool only_standard: If True, only returns the standard results from
214 each page. If False, it returns every possible link from each page,
215 except for those that point back to Google itself. Defaults to False
216 for backwards compatibility with older versions of this module.
217 :param dict of str to str extra_params: A dictionary of extra HTTP GET
218 parameters, which must be URL encoded. For example if you don't want
219 Google to filter similar results you can set the extra_params to
220 {'filter': '0'} which will append '&filter=0' to every query.
221 :param str tpe: Search type (images, videos, news, shopping, books, apps)
222 Use the following values {videos: 'vid', images: 'isch',
223 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
224 :param str or None user_agent: User agent for the HTTP requests.
225 Use None for the default.
226
227 :rtype: generator of str
228 :return: Generator (iterator) that yields found URLs.
229 If the stop parameter is None the iterator will loop forever.
230 """
231 # Set of hashes for the results found.
232 # This is used to avoid repeated results.
233 hashes = set()
234
235 # Count the number of links yielded
236 count = 0
237
238 # Prepare domain list if it exists.
239 if domains:
240 query = query + ' ' + ' OR '.join(
241 'site:' + domain for domain in domains)
242
243 # Prepare the search string.
244 query = quote_plus(query)
245
246 # Check extra_params for overlapping
247 for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
248 if builtin_param in extra_params.keys():
249 raise ValueError(
250 'GET parameter "%s" is overlapping with \
251 the built-in GET parameter',
252 builtin_param
253 )
254
255 # Grab the cookie from the home page.
256 get_page(url_home % vars(), user_agent)
257
258 # Prepare the URL of the first request.
259 if start:
260 if num == 10:
261 url = url_next_page % vars()
262 else:
263 url = url_next_page_num % vars()
264 else:
265 if num == 10:
266 url = url_search % vars()
267 else:
268 url = url_search_num % vars()
269
270 # Loop until we reach the maximum result, if any (otherwise, loop forever).
271 while not stop or count < stop:
272 # Remeber last count to detect the end of results
273 last_count = count
274
275 try: # Is it python<3?
276 iter_extra_params = extra_params.iteritems()
277 except AttributeError: # Or python>3?
278 iter_extra_params = extra_params.items()
279 # Append extra GET_parameters to URL
280 for k, v in iter_extra_params:
281 url += url + ('&%s=%s' % (k, v))
282
283 # Sleep between requests.
284 time.sleep(pause)
285
286 # Request the Google Search results page.
287 html = get_page(url, user_agent)
288
289 # Parse the response and process every anchored URL.
290 if is_bs4:
291 soup = BeautifulSoup(html, 'html.parser')
292 else:
293 soup = BeautifulSoup(html)
294 try:
295 anchors = soup.find(id='search').findAll('a')
296 # Sometimes (depending on the User-agent) there is
297 # no id "search" in html response
298 except AttributeError:
299 # Remove links of the top bar
300 gbar = soup.find(id='gbar')
301 if gbar:
302 gbar.clear()
303 anchors = soup.findAll('a')
304 for a in anchors:
305
306 # Leave only the "standard" results if requested.
307 # Otherwise grab all possible links.
308 if only_standard and (
309 not a.parent or a.parent.name.lower() != "h3"):
310 continue
311
312 # Get the URL from the anchor tag.
313 try:
314 link = a['href']
315 except KeyError:
316 continue
317
318 # Filter invalid links and links pointing to Google itself.
319 link = filter_result(link)
320 if not link:
321 continue
322
323 # Discard repeated results.
324 h = hash(link)
325 if h in hashes:
326 continue
327 hashes.add(h)
328
329 # Yield the result.
330 yield link
331
332 count += 1
333 if stop and count >= stop:
334 return
335
336 # End if there are no more results.
337 if last_count == count:
338 break
339
340 # Prepare the URL for the next request.
341 start += num
342 if num == 10:
343 url = url_next_page % vars()
344 else:
345 url = url_next_page_num % vars()
346
347
348 # Shortcut to search images.
349 # Beware, this does not return the image link.
350 def search_images(query, tld='com', lang='en', tbs='0', safe='off', num=10,
351 start=0, stop=None, pause=2.0, domains=None,
352 only_standard=False, extra_params={}):
353 """
354 Shortcut to search images.
355
356 :note: Beware, this does not return the image link.
357
358 :param str query: Query string. Must NOT be url-encoded.
359 :param str tld: Top level domain.
360 :param str lang: Language.
361 :param str tbs: Time limits (i.e "qdr:h" => last hour,
362 "qdr:d" => last 24 hours, "qdr:m" => last month).
363 :param str safe: Safe search.
364 :param int num: Number of results per page.
365 :param int start: First result to retrieve.
366 :param int or None stop: Last result to retrieve.
367 Use None to keep searching forever.
368 :param list of str or None domains: A list of web domains to constrain
369 the search.
370 :param float pause: Lapse to wait between HTTP requests.
371 A lapse too long will make the search slow, but a lapse too short may
372 cause Google to block your IP. Your mileage may vary!
373 :param bool only_standard: If True, only returns the standard results from
374 each page. If False, it returns every possible link from each page,
375 except for those that point back to Google itself. Defaults to False
376 for backwards compatibility with older versions of this module.
377 :param dict of str to str extra_params: A dictionary of extra HTTP GET
378 parameters, which must be URL encoded. For example if you don't want
379 Google to filter similar results you can set the extra_params to
380 {'filter': '0'} which will append '&filter=0' to every query.
381 :param str tpe: Search type (images, videos, news, shopping, books, apps)
382 Use the following values {videos: 'vid', images: 'isch',
383 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
384 :param str or None user_agent: User agent for the HTTP requests.
385 Use None for the default.
386
387 :rtype: generator of str
388 :return: Generator (iterator) that yields found URLs.
389 If the stop parameter is None the iterator will loop forever.
390 """
391 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
392 pause, only_standard, extra_params, tpe='isch')
393
394
395 # Shortcut to search news.
396 def search_news(query, tld='com', lang='en', tbs='0', safe='off', num=10,
397 start=0, stop=None, domains=None, pause=2.0,
398 only_standard=False, extra_params={}):
399 """
400 Shortcut to search news.
401
402 :param str query: Query string. Must NOT be url-encoded.
403 :param str tld: Top level domain.
404 :param str lang: Language.
405 :param str tbs: Time limits (i.e "qdr:h" => last hour,
406 "qdr:d" => last 24 hours, "qdr:m" => last month).
407 :param str safe: Safe search.
408 :param int num: Number of results per page.
409 :param int start: First result to retrieve.
410 :param int or None stop: Last result to retrieve.
411 Use None to keep searching forever.
412 :param list of str or None domains: A list of web domains to constrain
413 the search.
414 :param float pause: Lapse to wait between HTTP requests.
415 A lapse too long will make the search slow, but a lapse too short may
416 cause Google to block your IP. Your mileage may vary!
417 :param bool only_standard: If True, only returns the standard results from
418 each page. If False, it returns every possible link from each page,
419 except for those that point back to Google itself. Defaults to False
420 for backwards compatibility with older versions of this module.
421 :param dict of str to str extra_params: A dictionary of extra HTTP GET
422 parameters, which must be URL encoded. For example if you don't want
423 Google to filter similar results you can set the extra_params to
424 {'filter': '0'} which will append '&filter=0' to every query.
425 :param str tpe: Search type (images, videos, news, shopping, books, apps)
426 Use the following values {videos: 'vid', images: 'isch',
427 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
428 :param str or None user_agent: User agent for the HTTP requests.
429 Use None for the default.
430
431 :rtype: generator of str
432 :return: Generator (iterator) that yields found URLs.
433 If the stop parameter is None the iterator will loop forever.
434 """
435 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
436 pause, only_standard, extra_params, tpe='nws')
437
438
439 # Shortcut to search videos.
440 def search_videos(query, tld='com', lang='en', tbs='0', safe='off', num=10,
441 start=0, stop=None, domains=None, pause=2.0,
442 only_standard=False, extra_params={}):
443 """
444 Shortcut to search videos.
445
446 :param str query: Query string. Must NOT be url-encoded.
447 :param str tld: Top level domain.
448 :param str lang: Language.
449 :param str tbs: Time limits (i.e "qdr:h" => last hour,
450 "qdr:d" => last 24 hours, "qdr:m" => last month).
451 :param str safe: Safe search.
452 :param int num: Number of results per page.
453 :param int start: First result to retrieve.
454 :param int or None stop: Last result to retrieve.
455 Use None to keep searching forever.
456 :param list of str or None domains: A list of web domains to constrain
457 the search.
458 :param float pause: Lapse to wait between HTTP requests.
459 A lapse too long will make the search slow, but a lapse too short may
460 cause Google to block your IP. Your mileage may vary!
461 :param bool only_standard: If True, only returns the standard results from
462 each page. If False, it returns every possible link from each page,
463 except for those that point back to Google itself. Defaults to False
464 for backwards compatibility with older versions of this module.
465 :param dict of str to str extra_params: A dictionary of extra HTTP GET
466 parameters, which must be URL encoded. For example if you don't want
467 Google to filter similar results you can set the extra_params to
468 {'filter': '0'} which will append '&filter=0' to every query.
469 :param str tpe: Search type (images, videos, news, shopping, books, apps)
470 Use the following values {videos: 'vid', images: 'isch',
471 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
472 :param str or None user_agent: User agent for the HTTP requests.
473 Use None for the default.
474
475 :rtype: generator of str
476 :return: Generator (iterator) that yields found URLs.
477 If the stop parameter is None the iterator will loop forever.
478 """
479 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
480 pause, only_standard, extra_params, tpe='vid')
481
482
483 # Shortcut to search shop.
484 def search_shop(query, tld='com', lang='en', tbs='0', safe='off', num=10,
485 start=0, stop=None, domains=None, pause=2.0,
486 only_standard=False, extra_params={}):
487 """
488 Shortcut to search shop.
489
490 :param str query: Query string. Must NOT be url-encoded.
491 :param str tld: Top level domain.
492 :param str lang: Language.
493 :param str tbs: Time limits (i.e "qdr:h" => last hour,
494 "qdr:d" => last 24 hours, "qdr:m" => last month).
495 :param str safe: Safe search.
496 :param int num: Number of results per page.
497 :param int start: First result to retrieve.
498 :param int or None stop: Last result to retrieve.
499 Use None to keep searching forever.
500 :param list of str or None domains: A list of web domains to constrain
501 the search.
502 :param float pause: Lapse to wait between HTTP requests.
503 A lapse too long will make the search slow, but a lapse too short may
504 cause Google to block your IP. Your mileage may vary!
505 :param bool only_standard: If True, only returns the standard results from
506 each page. If False, it returns every possible link from each page,
507 except for those that point back to Google itself. Defaults to False
508 for backwards compatibility with older versions of this module.
509 :param dict of str to str extra_params: A dictionary of extra HTTP GET
510 parameters, which must be URL encoded. For example if you don't want
511 Google to filter similar results you can set the extra_params to
512 {'filter': '0'} which will append '&filter=0' to every query.
513 :param str tpe: Search type (images, videos, news, shopping, books, apps)
514 Use the following values {videos: 'vid', images: 'isch',
515 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
516 :param str or None user_agent: User agent for the HTTP requests.
517 Use None for the default.
518
519 :rtype: generator of str
520 :return: Generator (iterator) that yields found URLs.
521 If the stop parameter is None the iterator will loop forever.
522 """
523 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
524 pause, only_standard, extra_params, tpe='shop')
525
526
527 # Shortcut to search books.
528 def search_books(query, tld='com', lang='en', tbs='0', safe='off', num=10,
529 start=0, stop=None, domains=None, pause=2.0,
530 only_standard=False, extra_params={}):
531 """
532 Shortcut to search books.
533
534 :param str query: Query string. Must NOT be url-encoded.
535 :param str tld: Top level domain.
536 :param str lang: Language.
537 :param str tbs: Time limits (i.e "qdr:h" => last hour,
538 "qdr:d" => last 24 hours, "qdr:m" => last month).
539 :param str safe: Safe search.
540 :param int num: Number of results per page.
541 :param int start: First result to retrieve.
542 :param int or None stop: Last result to retrieve.
543 Use None to keep searching forever.
544 :param list of str or None domains: A list of web domains to constrain
545 the search.
546 :param float pause: Lapse to wait between HTTP requests.
547 A lapse too long will make the search slow, but a lapse too short may
548 cause Google to block your IP. Your mileage may vary!
549 :param bool only_standard: If True, only returns the standard results from
550 each page. If False, it returns every possible link from each page,
551 except for those that point back to Google itself. Defaults to False
552 for backwards compatibility with older versions of this module.
553 :param dict of str to str extra_params: A dictionary of extra HTTP GET
554 parameters, which must be URL encoded. For example if you don't want
555 Google to filter similar results you can set the extra_params to
556 {'filter': '0'} which will append '&filter=0' to every query.
557 :param str tpe: Search type (images, videos, news, shopping, books, apps)
558 Use the following values {videos: 'vid', images: 'isch',
559 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
560 :param str or None user_agent: User agent for the HTTP requests.
561 Use None for the default.
562
563 :rtype: generator of str
564 :return: Generator (iterator) that yields found URLs.
565 If the stop parameter is None the iterator will loop forever.
566 """
567 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
568 pause, only_standard, extra_params, tpe='bks')
569
570
571 # Shortcut to search apps.
572 def search_apps(query, tld='com', lang='en', tbs='0', safe='off', num=10,
573 start=0, stop=None, domains=None, pause=2.0,
574 only_standard=False, extra_params={}):
575 """
576 Shortcut to search apps.
577
578 :param str query: Query string. Must NOT be url-encoded.
579 :param str tld: Top level domain.
580 :param str lang: Language.
581 :param str tbs: Time limits (i.e "qdr:h" => last hour,
582 "qdr:d" => last 24 hours, "qdr:m" => last month).
583 :param str safe: Safe search.
584 :param int num: Number of results per page.
585 :param int start: First result to retrieve.
586 :param int or None stop: Last result to retrieve.
587 Use None to keep searching forever.
588 :param list of str or None domains: A list of web domains to constrain
589 the search.
590 :param float pause: Lapse to wait between HTTP requests.
591 A lapse too long will make the search slow, but a lapse too short may
592 cause Google to block your IP. Your mileage may vary!
593 :param bool only_standard: If True, only returns the standard results from
594 each page. If False, it returns every possible link from each page,
595 except for those that point back to Google itself. Defaults to False
596 for backwards compatibility with older versions of this module.
597 :param dict of str to str extra_params: A dictionary of extra HTTP GET
598 parameters, which must be URL encoded. For example if you don't want
599 Google to filter similar results you can set the extra_params to
600 {'filter': '0'} which will append '&filter=0' to every query.
601 :param str tpe: Search type (images, videos, news, shopping, books, apps)
602 Use the following values {videos: 'vid', images: 'isch',
603 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
604 :param str or None user_agent: User agent for the HTTP requests.
605 Use None for the default.
606
607 :rtype: generator of str
608 :return: Generator (iterator) that yields found URLs.
609 If the stop parameter is None the iterator will loop forever.
610 """
611 return search(query, tld, lang, tbs, safe, num, start, stop, domains,
612 pause, only_standard, extra_params, tpe='app')
613
614
615 # Shortcut to single-item search.
616 # Evaluates the iterator to return the single URL as a string.
617 def lucky(query, tld='com', lang='en', tbs='0', safe='off',
618 only_standard=False, extra_params={}, tpe=''):
619 """
620 Shortcut to single-item search.
621
622 :param str query: Query string. Must NOT be url-encoded.
623 :param str tld: Top level domain.
624 :param str lang: Language.
625 :param str tbs: Time limits (i.e "qdr:h" => last hour,
626 "qdr:d" => last 24 hours, "qdr:m" => last month).
627 :param str safe: Safe search.
628 :param int num: Number of results per page.
629 :param int start: First result to retrieve.
630 :param int or None stop: Last result to retrieve.
631 Use None to keep searching forever.
632 :param list of str or None domains: A list of web domains to constrain
633 the search.
634 :param float pause: Lapse to wait between HTTP requests.
635 A lapse too long will make the search slow, but a lapse too short may
636 cause Google to block your IP. Your mileage may vary!
637 :param bool only_standard: If True, only returns the standard results from
638 each page. If False, it returns every possible link from each page,
639 except for those that point back to Google itself. Defaults to False
640 for backwards compatibility with older versions of this module.
641 :param dict of str to str extra_params: A dictionary of extra HTTP GET
642 parameters, which must be URL encoded. For example if you don't want
643 Google to filter similar results you can set the extra_params to
644 {'filter': '0'} which will append '&filter=0' to every query.
645 :param str tpe: Search type (images, videos, news, shopping, books, apps)
646 Use the following values {videos: 'vid', images: 'isch',
647 news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
648 :param str or None user_agent: User agent for the HTTP requests.
649 Use None for the default.
650
651 :rtype: str
652 :return: URL found by Google.
653 """
654 gen = search(query, tld, lang, tbs, safe, 1, 0, 1, 0., only_standard,
655 extra_params, tpe)
656 return next(gen)
657
658
659 # Returns only the number of Google hits for the given search query.
660 # This is the number reported by Google itself, NOT by scraping.
def hits(query, tld='com', lang='en', tbs='0', safe='off',
         domains=None, extra_params=None, tpe='', user_agent=None):
    """
    Search the given query string using Google and return the number of hits.

    :note: This is the number reported by Google itself, NOT by scraping.

    :param str query: Query string. Must NOT be url-encoded.
    :param str tld: Top level domain.
    :param str lang: Language.
    :param str tbs: Time limits (i.e "qdr:h" => last hour,
        "qdr:d" => last 24 hours, "qdr:m" => last month).
    :param str safe: Safe search.
    :param list of str or None domains: A list of web domains to constrain
        the search.
    :param dict of str to str or None extra_params: A dictionary of extra
        HTTP GET parameters, which must be URL encoded. For example if you
        don't want Google to filter similar results you can set the
        extra_params to {'filter': '0'} which will append '&filter=0' to
        every query.
    :param str tpe: Search type (images, videos, news, shopping, books, apps)
        Use the following values {videos: 'vid', images: 'isch',
        news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
    :param str or None user_agent: User agent for the HTTP requests.
        Use None for the default.

    :rtype: int
    :return: Number of Google hits for the given search query.

    :raises ValueError: If extra_params contains any of the GET parameters
        this function builds itself ('hl', 'q', 'btnG', 'tbs', 'safe',
        'tbm').
    """

    # Use a fresh dict per call instead of a shared mutable default
    # argument, which would leak state between calls.
    if extra_params is None:
        extra_params = {}

    # Prepare domain list if it exists.
    if domains:
        domain_query = '+OR+'.join('site:' + domain for domain in domains)
        domain_query = '+' + domain_query
    else:
        domain_query = ''

    # Prepare the search string.
    query = quote_plus(query + domain_query)

    # Check extra_params for overlapping with the built-in GET parameters.
    # Bug fix: the old code passed the parameter name as a second argument
    # to ValueError instead of interpolating it into the message.
    for builtin_param in ('hl', 'q', 'btnG', 'tbs', 'safe', 'tbm'):
        if builtin_param in extra_params:
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param
            )

    # Grab the cookie from the home page.
    # Note: url_home is %-formatted from the local variables above.
    get_page(url_home % vars(), user_agent)

    # Prepare the URL of the first (and in this case ONLY) request.
    url = url_search % vars()

    try:  # Is it python<3?
        iter_extra_params = extra_params.iteritems()
    except AttributeError:  # Or python>3?
        iter_extra_params = extra_params.items()
    # Append extra GET parameters to the URL.
    # Bug fix: the old code did "url += url + ..." which duplicated the
    # whole URL once per extra parameter, producing a garbage request.
    for k, v in iter_extra_params:
        url += '&%s=%s' % (k, v)

    # Request the Google Search results page.
    html = get_page(url, user_agent)

    # Parse the response.
    if is_bs4:
        soup = BeautifulSoup(html, 'html.parser')
    else:
        soup = BeautifulSoup(html)

    # Get the number of hits from the "resultStats" element.
    # Robustness: if the element is missing entirely (layout change,
    # captcha page), report zero hits instead of raising IndexError.
    tags = soup.find_all(attrs={"class": "sd", "id": "resultStats"})
    if not tags:
        return 0
    hits_text_parts = tags[0].text.split()
    if len(hits_text_parts) < 3:
        return 0
    return int(hits_text_parts[1].replace(',', '').replace('.', ''))
749
750
def ngd(term1, term2):
    """
    Return the Normalized Google distance between words.

    For more info, refer to:
    https://en.wikipedia.org/wiki/Normalized_Google_distance

    :param str term1: First term to compare.
    :param str term2: Second term to compare.

    :rtype: float
    :return: Normalized Google distance between words.
    """

    # Log-scaled hit counts: each term on its own, then both together.
    log_hits_1 = math.log10(hits(term1))
    log_hits_2 = math.log10(hits(term2))
    log_hits_both = math.log10(hits('"' + term1 + '" "' + term2 + '"'))

    # Approximate the index size from the hits for a very common word,
    # scaled by a correction factor since not every page contains "the".
    total_pages = hits('the')
    correction_factor = 1000
    log_index_size = math.log10(total_pages * correction_factor)

    numerator = max(log_hits_1, log_hits_2) - log_hits_both
    denominator = log_index_size - min(log_hits_1, log_hits_2)
    return numerator / denominator
0 beautifulsoup4>=4.0
0 #!/usr/bin/env python
1
2 # Python bindings to the Google search engine
3 # Copyright (c) 2009-2016, Mario Vilas
4 # All rights reserved.
5 #
6 # Redistribution and use in source and binary forms, with or without
7 # modification, are permitted provided that the following conditions are met:
8 #
9 # * Redistributions of source code must retain the above copyright notice,
10 # this list of conditions and the following disclaimer.
11 # * Redistributions in binary form must reproduce the above copyright
12 # notice,this list of conditions and the following disclaimer in the
13 # documentation and/or other materials provided with the distribution.
14 # * Neither the name of the copyright holder nor the names of its
15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission.
17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 # POSSIBILITY OF SUCH DAMAGE.
29
30 import sys
31
32 from googlesearch import search, get_random_user_agent
33
34 from optparse import OptionParser, IndentedHelpFormatter
35
class BannerHelpFormatter(IndentedHelpFormatter):

    "Help formatter that prints a banner above the usage message."

    def __init__(self, banner, *args, **kwargs):
        # Remember the banner text, then defer to the stock formatter.
        self.banner = banner
        IndentedHelpFormatter.__init__(self, *args, **kwargs)

    def format_usage(self, usage):
        # Emit the banner first, followed by the standard usage text.
        formatted = IndentedHelpFormatter.format_usage(self, usage)
        return '%s\n%s' % (self.banner, formatted)
47
# Parse the command line arguments.
formatter = BannerHelpFormatter(
    "Python script to use the Google search engine\n"
    "By Mario Vilas (mvilas at gmail dot com)\n"
    "https://github.com/MarioVilas/googlesearch\n"
)
parser = OptionParser(formatter=formatter)
parser.set_usage("%prog [options] query")
parser.add_option("--tld", metavar="TLD", type="string", default="com",
                  help="top level domain to use [default: com]")
parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
                  help="produce results in the given language [default: en]")
parser.add_option("--domains", metavar="DOMAINS", type="string", default="",
                  help="comma separated list of domains to constrain the search to")
parser.add_option("--tbs", metavar="TBS", type="string", default="0",
                  help="produce results from period [default: 0]")
parser.add_option("--safe", metavar="SAFE", type="string", default="off",
                  help="kids safe search [default: off]")
parser.add_option("--num", metavar="NUMBER", type="int", default=10,
                  help="number of results per page [default: 10]")
parser.add_option("--start", metavar="NUMBER", type="int", default=0,
                  help="first result to retrieve [default: 0]")
parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
                  help="last result to retrieve [default: unlimited]")
parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
                  help="pause between HTTP requests [default: 2.0]")
parser.add_option("--rua", metavar="USERAGENT", action="store_true", default=False,
                  help="Randomize the User-Agent [default: no]")
parser.add_option("--all", dest="only_standard",
                  action="store_false", default=True,
                  help="grab all possible links from result pages [default: only standard results]")
(options, args) = parser.parse_args()
query = ' '.join(args)
if not query:
    parser.print_help()
    sys.exit(2)

# Collect the search() keyword arguments from the parsed options.
params = {k: v for (k, v) in options.__dict__.items()
          if not k.startswith('_')}

# Split the comma separated list of domains, if present.
# Bug fix: the old code split the default empty string too, producing
# [''] - a truthy list that constrained every search to the bogus
# empty domain "site:". Pass None when no domains were given.
domains = params.get('domains')
if domains:
    params['domains'] = [x.strip() for x in domains.split(',')]
else:
    params['domains'] = None

# Randomize the user agent if requested.
if params.pop('rua', False):
    params['user_agent'] = get_random_user_agent()

# Run the query.
for url in search(query, **params):
    print(url)
    try:
        sys.stdout.flush()
    except Exception:
        # Best effort: keep printing results even if stdout cannot be
        # flushed (e.g. the reader of a pipe went away). A narrowed
        # except so Ctrl-C still interrupts the loop.
        pass
0 [bdist_wheel]
1 universal = 1
2
3 [egg_info]
4 tag_build =
5 tag_date = 0
6
0 #!/usr/bin/env python
1
2 # Copyright (c) 2009-2019, Mario Vilas
3 # All rights reserved.
4 #
5 # Redistribution and use in source and binary forms, with or without
6 # modification, are permitted provided that the following conditions are met:
7 #
8 # * Redistributions of source code must retain the above copyright notice,
9 # this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above copyright
11 # notice,this list of conditions and the following disclaimer in the
12 # documentation and/or other materials provided with the distribution.
13 # * Neither the name of the copyright holder nor the names of its
14 # contributors may be used to endorse or promote products derived from
15 # this software without specific prior written permission.
16 #
17 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 # POSSIBILITY OF SUCH DAMAGE.
28
29 from os import chdir
30 from os.path import abspath, join, split
31
# Make sure we are standing in the correct directory.
# Old versions of distutils didn't take care of this.
here = split(abspath(__file__))[0]
chdir(here)

# Package metadata.
metadata = dict(
    name='google',
    provides=['googlesearch'],
    requires=['beautifulsoup4'],
    packages=['googlesearch'],
    scripts=[join('scripts', 'google')],
    package_data={'googlesearch': ['user_agents.txt.gz']},
    include_package_data=True,
    version="2.0.2",
    description="Python bindings to the Google search engine.",
    author="Mario Vilas",
    author_email="[email protected]",
    url="http://breakingcode.wordpress.com/",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Environment :: Console",
        "Programming Language :: Python",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)

# Prefer setuptools over the old distutils.
# If setuptools is available, use install_requires.
try:
    from setuptools import setup
    metadata['install_requires'] = metadata['requires']
except ImportError:
    from distutils.core import setup

# Get the long description from the readme file.
# Bug fix: the old 'rU' open mode was removed in Python 3.11 (universal
# newlines are the default for text mode anyway), and the file handle
# was never closed. Missing readme is non-fatal, best effort only.
try:
    with open(join(here, 'README.md')) as readme:
        metadata['long_description'] = readme.read()
except Exception:
    pass

# Run the setup script.
setup(**metadata)