#!/usr/bin/env python3
import os
import re
import bs4
import lxml  # imported so a missing 'lxml' parser backend for bs4 fails fast
import json
import asyncio
import requests
import threading
import tldextract
from datetime import date
requests.packages.urllib3.disable_warnings()  # silence InsecureRequestWarning from verify=False requests
R = '\033[31m' # red
G = '\033[32m' # green
C = '\033[36m' # cyan
W = '\033[0m' # reset
Y = '\033[33m' # yellow
user_agent = {
'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'
}
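# Shared state: the parsed landing page, the derived robots/sitemap URLs, and one
# result list per extraction category (merged and deduplicated later in out()).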
soup = ''
r_url = ''
sm_url = ''
total = []
r_total = []
sm_total = []
js_total = []
css_total = []
int_total = []
ext_total = []
img_total = []
js_crawl_total = []
sm_crawl_total = []
wayback_total = []
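# Entry point: fetch the target page, work out the robots.txt / sitemap.xml
# locations, run every extractor and hand the collected links to out().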
def crawler(target, output, data):
global soup, r_url, sm_url
	print('\n' + Y + '[!]' + C + ' Starting Crawler...' + W + '\n')
try:
rqst = requests.get(target, headers=user_agent, verify=False, timeout=10)
except Exception as e:
print(R + '[-] Exception : ' + C + str(e) + W)
return
sc = rqst.status_code
if sc == 200:
page = rqst.content
soup = bs4.BeautifulSoup(page, 'lxml')
		protocol = target.split('://')[0]
temp_tgt = target.split('://')[1]
pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}'
custom = bool(re.match(pattern, temp_tgt))
		if custom:
r_url = protocol + '://' + temp_tgt + '/robots.txt'
sm_url = protocol + '://' + temp_tgt + '/sitemap.xml'
else:
ext = tldextract.extract(target)
hostname = '.'.join(part for part in ext if part)
r_url = protocol + '://' + hostname + '/robots.txt'
sm_url = protocol + '://' + hostname + '/sitemap.xml'
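		# The extractors below call requests synchronously, so despite asyncio.gather
		# they effectively run one after another on this private event loop.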
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
tasks = asyncio.gather(
robots(target),
sitemap(),
css(target),
js(target),
internal_links(target),
external_links(target),
images(target),
sm_crawl(),
js_crawl(),
wayback(target))
loop.run_until_complete(tasks)
loop.close()
out(target, output, data)
else:
		print(R + '[-]' + C + ' Status : ' + W + str(sc))
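# Normalise one extracted link (the global 'url') into an absolute URL under the target.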
def url_filter(target):
global url
	# A link starting with a single '/' is relative to the site root
	if url.startswith('/') and not url.startswith('//'):
		url = target + url
	# No scheme in the link: turn '//' into 'http://' and resolve './' / '../' against the target
	if 'http://' not in url and 'https://' not in url:
		url = url.replace('//', 'http://')
		url = url.replace('../', target + '/')
		url = url.replace('./', target + '/')
	# Still no scheme and no relative marker: treat it as a bare path under the target
	if all(frag not in url for frag in ('//', '../', './', 'http://', 'https://')):
		url = target + '/' + url
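# Wayback Machine lookup: check the availability API first, then query the CDX
# API for URLs archived with status 200 during the current and previous year.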
async def wayback(target):
global wayback_total
is_avail = False
ext = tldextract.extract(target)
domain = ext.registered_domain
if len(domain) < 2:
domain = ext.domain
domain_query = domain + '/*'
curr_yr = date.today().year
last_yr = curr_yr - 1
print(Y + '[!]' + C + ' Checking Availability on Wayback Machine' + W, end = '')
wm_avail = 'http://archive.org/wayback/available'
avail_data = { 'url': domain }
try:
check_rqst = requests.get(wm_avail, params=avail_data, timeout=10)
check_sc = check_rqst.status_code
if check_sc == 200:
check_data = check_rqst.text
json_chk_data = json.loads(check_data)
avail_data = json_chk_data['archived_snapshots']
if len(avail_data) != 0:
is_avail = True
print(G + '['.rjust(5, '.') + ' Available ]')
else:
print(R + '['.rjust(5, '.') + ' N/A ]')
else:
print('\n' + R + '[-] Status : ' + C + str(check_sc) + W)
except Exception as e:
print('\n' + R + '[-] Exception : ' + C + str(e) + W)
	if is_avail:
print(Y + '[!]' + C + ' Requesting Wayback Machine' + W, end = '')
wm_url = 'http://web.archive.org/cdx/search/cdx'
data = {
'url': domain_query,
'fl': 'original',
'fastLatest': 'true',
			'from': str(last_yr),
			'to': str(curr_yr),
'filter': 'statuscode:200'
}
try:
			r = requests.get(wm_url, params=data, timeout=10)
r_sc = r.status_code
if r_sc == 200:
r_data = r.text
if len(r_data) != 0:
					r_data = set(filter(None, r_data.split('\n')))  # drop the empty trailing entry
print(G + '['.rjust(5, '.') + ' {} ]'.format(str(len(r_data))))
wayback_total.extend(r_data)
else:
print(R + '['.rjust(5, '.') + ' Not Found ]' + W)
else:
print(R + '['.rjust(5, '.') + ' {} ]'.format(r_sc) + W)
except Exception as e:
print('\n' + R + '[-] Exception : ' + C + str(e) + W)
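# Fetch robots.txt and collect the Allow / Disallow / Sitemap entries.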
async def robots(target):
global url, r_url, r_total
print(G + '[+]' + C + ' Looking for robots.txt' + W, end = '')
try:
r_rqst = requests.get(r_url, headers=user_agent, verify=False, timeout=10)
r_sc = r_rqst.status_code
if r_sc == 200:
print(G + '['.rjust(9, '.') + ' Found ]' + W)
print(G + '[+]' + C + ' Extracting robots Links', end = '')
r_page = r_rqst.text
r_scrape = r_page.split('\n')
for entry in r_scrape:
				if entry.startswith(('Disallow', 'Allow', 'Sitemap')):
url = entry.split(': ')
try:
url = url[1]
url = url.strip()
url_filter(target)
r_total.append(url)
						if url.endswith('xml'):
sm_total.append(url)
except:
pass
r_total = set(r_total)
print(G + '['.rjust(8, '.') + ' {} ]'.format(str(len(r_total))))
elif r_sc == 404:
print(R + '['.rjust(9, '.') + ' Not Found ]' + W)
else:
print(R + '['.rjust(9, '.') + ' {} ]'.format(r_sc) + W)
except Exception as e:
print('\n' + R + '[-] Exception : ' + C + str(e) + W)
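# Fetch sitemap.xml and collect every <loc> entry.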
async def sitemap():
global url, sm_url, total, sm_total
print(G + '[+]' + C + ' Looking for sitemap.xml' + W, end = '')
try:
sm_rqst = requests.get(sm_url, headers=user_agent, verify=False, timeout=10)
sm_sc = sm_rqst.status_code
if sm_sc == 200:
print(G + '['.rjust(8, '.') + ' Found ]' + W)
print(G + '[+]' + C + ' Extracting sitemap Links', end = '')
sm_page = sm_rqst.content
sm_soup = bs4.BeautifulSoup(sm_page, 'xml')
links = sm_soup.find_all('loc')
for url in links:
url = url.get_text()
				if url is not None:
sm_total.append(url)
sm_total = set(sm_total)
print(G + '['.rjust(7, '.') + ' {} ]'.format(str(len(sm_total))))
elif sm_sc == 404:
print(R + '['.rjust(8, '.') + ' Not Found ]' + W)
else:
print(R + '['.rjust(8, '.') + ' {} ]'.format(sm_sc) + W)
except Exception as e:
print('\n' + R + '[-] Exception : ' + C + str(e))
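# Pull stylesheet URLs from the landing page's <link href> tags.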
async def css(target):
global url, soup, total, css_total
print(G + '[+]' + C + ' Extracting CSS Links' + W, end = '')
css = soup.find_all('link')
for link in css:
url = link.get('href')
		if url is not None and '.css' in url:
url_filter(target)
css_total.append(url)
css_total = set(css_total)
print(G + '['.rjust(11, '.') + ' {} ]'.format(str(len(css_total))) + W)
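# Pull javascript URLs from the landing page's <script src> tags.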
async def js(target):
global url, total, js_total
print(G + '[+]' + C + ' Extracting Javascript Links' + W, end = '')
js = soup.find_all('script')
for link in js:
url = link.get('src')
		if url is not None and '.js' in url:
url_filter(target)
js_total.append(url)
js_total = set(js_total)
print(G + '['.rjust(4, '.') + ' {} ]'.format(str(len(js_total))))
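# Collect <a href> links that contain the target's registered domain.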
async def internal_links(target):
global total, int_total
print(G + '[+]' + C + ' Extracting Internal Links' + W, end = '')
ext = tldextract.extract(target)
domain = ext.registered_domain
links = soup.find_all('a')
for link in links:
url = link.get('href')
		if url is not None and domain in url:
			int_total.append(url)
int_total = set(int_total)
print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(int_total))))
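# Collect <a href> links pointing outside the target's registered domain.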
async def external_links(target):
global total, ext_total
print(G + '[+]' + C + ' Extracting External Links' + W, end = '')
ext = tldextract.extract(target)
domain = ext.registered_domain
links = soup.find_all('a')
for link in links:
url = link.get('href')
		if url is not None and domain not in url and 'http' in url:
			ext_total.append(url)
ext_total = set(ext_total)
print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(ext_total))))
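# Collect image sources from <img src> tags.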
async def images(target):
global url, total, img_total
print(G + '[+]' + C + ' Extracting Images' + W, end = '')
images = soup.find_all('img')
for link in images:
url = link.get('src')
		if url is not None and len(url) > 1:
url_filter(target)
img_total.append(url)
img_total = set(img_total)
print(G + '['.rjust(14, '.') + ' {} ]'.format(str(len(img_total))))
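# Fetch each sitemap discovered earlier (one thread per sitemap) and collect
# the <loc> entries inside it.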
async def sm_crawl():
global sm_crawl_total
print(G + '[+]' + C + ' Crawling Sitemaps' + W, end = '')
threads = []
def fetch(site_url):
try:
sm_rqst = requests.get(site_url, headers=user_agent, verify=False, timeout=10)
sm_sc = sm_rqst.status_code
if sm_sc == 200:
sm_data = sm_rqst.content.decode()
sm_soup = bs4.BeautifulSoup(sm_data, 'xml')
links = sm_soup.find_all('loc')
for url in links:
url = url.get_text()
					if url is not None:
sm_crawl_total.append(url)
elif sm_sc == 404:
print(R + '['.rjust(8, '.') + ' Not Found ]' + W)
else:
print(R + '['.rjust(8, '.') + ' {} ]'.format(sm_sc) + W)
except Exception as e:
print('\n' + R + '[-] Exception : ' + C + str(e))
for site_url in sm_total:
if site_url != sm_url:
			if site_url.endswith('xml'):
t = threading.Thread(target=fetch, args=[site_url])
t.daemon = True
threads.append(t)
t.start()
for thread in threads:
thread.join()
sm_crawl_total = set(sm_crawl_total)
print(G + '['.rjust(14, '.') + ' {} ]'.format(str(len(sm_crawl_total))))
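# Download each discovered javascript file (one thread per file) and extract
# any quoted http(s) URLs from its source.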
async def js_crawl():
global js_crawl_total
print(G + '[+]' + C + ' Crawling Javascripts' + W, end = '')
threads = []
def fetch(js_url):
try:
js_rqst = requests.get(js_url, headers=user_agent, verify=False, timeout=10)
js_sc = js_rqst.status_code
if js_sc == 200:
js_data = js_rqst.content.decode()
js_data = js_data.split(';')
for line in js_data:
					if 'http://' in line or 'https://' in line:
found = re.findall(r'\"(http[s]?://.*?)\"', line)
for item in found:
if len(item) > 8:
js_crawl_total.append(item)
except Exception as e:
print('\n' + R + '[-] Exception : ' + C + str(e))
for js_url in js_total:
t = threading.Thread(target=fetch, args=[js_url])
t.daemon = True
threads.append(t)
t.start()
for thread in threads:
thread.join()
js_crawl_total = set(js_crawl_total)
print(G + '['.rjust(11, '.') + ' {} ]'.format(str(len(js_crawl_total))))
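# Merge every per-category result, print the unique total and, when an output
# format was requested, record the details under data['module-Crawler'].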
def out(target, output, data):
global total
total.extend(r_total)
total.extend(sm_total)
total.extend(css_total)
total.extend(js_total)
total.extend(js_crawl_total)
total.extend(sm_crawl_total)
total.extend(int_total)
total.extend(ext_total)
total.extend(img_total)
total.extend(wayback_total)
total = set(total)
print('\n' + G + '[+]' + C + ' Total Unique Links Extracted : ' + W + str(len(total)))
if output != 'None':
if len(total) != 0:
data['module-Crawler'] = {'Total Unique Links Extracted': str(len(total))}
try:
target_title = soup.title.string
except AttributeError:
target_title = 'None'
data['module-Crawler'].update({'Title ': str(target_title)})
data['module-Crawler'].update(
{
'Count ( Robots )': str(len(r_total)),
'Count ( Sitemap )': str(len(sm_total)),
'Count ( CSS )': str(len(css_total)),
'Count ( JS )': str(len(js_total)),
'Count ( Links in JS )': str(len(js_crawl_total)),
'Count ( Links in Sitemaps )': str(len(sm_crawl_total)),
'Count ( Internal )': str(len(int_total)),
'Count ( External )': str(len(ext_total)),
'Count ( Images )': str(len(img_total)),
					'Count ( Wayback Machine )': str(len(wayback_total)),
'Count ( Total )': str(len(total))
})
if len(r_total) != 0:
data['module-Crawler'].update({'Robots': list(r_total)})
if len(sm_total) != 0:
data['module-Crawler'].update({'Sitemaps': list(sm_total)})
if len(css_total) != 0:
data['module-Crawler'].update({'CSS': list(css_total)})
if len(js_total) != 0:
data['module-Crawler'].update({'Javascripts': list(js_total)})
if len(js_crawl_total) != 0:
data['module-Crawler'].update({'Links inside Javascripts': list(js_crawl_total)})
if len(sm_crawl_total) != 0:
data['module-Crawler'].update({'Links Inside Sitemaps': list(sm_crawl_total)})
if len(int_total) != 0:
data['module-Crawler'].update({'Internal Links': list(int_total)})
if len(ext_total) != 0:
data['module-Crawler'].update({'External Links': list(ext_total)})
if len(img_total) != 0:
data['module-Crawler'].update({'Images': list(img_total)})
if len(wayback_total) != 0:
data['module-Crawler'].update({'Wayback Machine': list(wayback_total)})
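# Minimal standalone usage sketch -- assumption: the parent tool normally imports
# crawler() and supplies these arguments; the target below is illustrative only,
# and passing 'None' as the output format skips report export in out().
if __name__ == '__main__':
	report = {}
	crawler('https://example.com', 'None', report)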