#!/usr/bin/env python3

import re
import bs4
import lxml  # noqa: F401 -- backs BeautifulSoup's 'lxml' and 'xml' parsers
import asyncio
import requests
import threading
import tldextract
from modules.export import export

# Recon targets often have self-signed certificates; every request below
# uses verify=False, so silence urllib3's InsecureRequestWarning noise.
requests.packages.urllib3.disable_warnings()

R = '\033[31m'  # red
G = '\033[32m'  # green
C = '\033[36m'  # cyan
W = '\033[0m'   # white
Y = '\033[33m'  # yellow

user_agent = {'User-Agent': 'FinalRecon'}

soup = ''
total = []
r_total = []
sm_total = []
js_total = []
css_total = []
int_total = []
ext_total = []
img_total = []
js_crawl_total = []
sm_crawl_total = []


def crawler(target, output, data):
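	# Fetch the target page once, work out where robots.txt and sitemap.xml
	# live, then run every extractor against the shared soup and print stats.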
	global soup, r_url, sm_url
	print(f'\n{Y}[!] Starting Crawler...{W}\n')

	try:
		rqst = requests.get(target, headers=user_agent, verify=False, timeout=10)
	except Exception as e:
		print(f'{R} [-] Exception : {C}{e}{W}')
		return

	sc = rqst.status_code
	if sc == 200:
		page = rqst.content
		soup = bs4.BeautifulSoup(page, 'lxml')

		protocol = target.split('://')
		protocol = protocol[0]
		temp_tgt = target.split('://')[1]
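		# Bare IP:port targets bypass the tldextract-based hostname handling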
		pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}'
		custom = bool(re.match(pattern, temp_tgt))
		if custom:
			r_url = f'{protocol}://{temp_tgt}/robots.txt'
			sm_url = f'{protocol}://{temp_tgt}/sitemap.xml'
			base_url = f'{protocol}://{temp_tgt}'
		else:
			ext = tldextract.extract(target)
			hostname = '.'.join(part for part in ext if part)
			base_url = f'{protocol}://{hostname}'
			r_url = f'{base_url}/robots.txt'
			sm_url = f'{base_url}/sitemap.xml'

		loop = asyncio.new_event_loop()
		asyncio.set_event_loop(loop)
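		# None of these coroutines ever awaits, so gather() runs each one to
		# completion in order: robots/sitemap/js populate the lists that
		# sm_crawl and js_crawl consume afterwards.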
		tasks = asyncio.gather(
			robots(r_url, base_url, data, output),
			sitemap(sm_url, data, output),
			css(target, data, output),
			js(target, data, output),
			internal_links(target, data, output),
			external_links(target, data, output),
			images(target, data, output),
			sm_crawl(data, output),
			js_crawl(data, output))
		loop.run_until_complete(tasks)
		loop.close()
		stats(output, data)
	else:
		print(f'{R}[-] {C}Status : {W}{sc}')


def url_filter(target, link):
	# Root-relative link ('/path'), but not protocol-relative ('//host')
	if link.startswith('/') and not link.startswith('//'):
		return target + link

	# Protocol-relative link ('//host/path'); assume plain http
	if link.startswith('//'):
		return 'http:' + link

	# Bare relative link with no scheme and no traversal ('page.html')
	if all([
		'//' not in link,
		'../' not in link,
		'./' not in link,
		'http://' not in link,
		'https://' not in link]
	):
		return f'{target}/{link}'

	# Scheme-less link containing traversal segments: anchor it to the
	# target (the replacements chain, so both '../' and './' are rewritten)
	if 'http://' not in link and 'https://' not in link:
		link = link.replace('../', f'{target}/')
		link = link.replace('./', f'{target}/')
		return link

	return link


async def robots(robo_url, base_url, data, output):
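	# Collect Allow/Disallow/Sitemap entries from robots.txt; XML entries
	# are also queued in sm_total for sm_crawl() to fetch later.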
	global r_total
	print(f'{G}[+] {C}Looking for robots.txt{W}', end='', flush=True)

	try:
		r_rqst = requests.get(robo_url, headers=user_agent, verify=False, timeout=10)
		r_sc = r_rqst.status_code
		if r_sc == 200:
			print(G + '['.rjust(9, '.') + ' Found ]' + W)
			print(f'{G}[+] {C}Extracting robots Links{W}', end='', flush=True)
			r_page = r_rqst.text
			r_scrape = r_page.split('\n')
			for entry in r_scrape:
				if any([
					entry.find('Disallow') == 0,
					entry.find('Allow') == 0,
					entry.find('Sitemap') == 0]):

					url = entry.split(': ')
					try:
						url = url[1]
						url = url.strip()
						tmp_url = url_filter(base_url, url)
						if tmp_url is not None:
							r_total.append(tmp_url)
						if url.endswith('xml'):
							sm_total.append(url)
					except Exception:
						pass

			r_total = set(r_total)
			print(G + '['.rjust(8, '.') + ' {} ]'.format(str(len(r_total))) + W)
			exporter(data, output, r_total, 'robots')
		elif r_sc == 404:
			print(R + '['.rjust(9, '.') + ' Not Found ]' + W)
		else:
			print(R + '['.rjust(9, '.') + ' {} ]'.format(r_sc) + W)
	except Exception as e:
		print(f'\n{R}[-] Exception : {C}{e}{W}')


async def sitemap(sm_url, data, output):
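	# Pull every <loc> URL out of the sitemap XML.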
	global sm_total
	print(f'{G}[+] {C}Looking for sitemap.xml{W}', end='', flush=True)
	try:
		sm_rqst = requests.get(sm_url, headers=user_agent, verify=False, timeout=10)
		sm_sc = sm_rqst.status_code
		if sm_sc == 200:
			print(G + '['.rjust(8, '.') + ' Found ]' + W)
			print(f'{G}[+] {C}Extracting sitemap Links{W}', end='', flush=True)
			sm_page = sm_rqst.content
			sm_soup = bs4.BeautifulSoup(sm_page, 'xml')
			links = sm_soup.find_all('loc')
			for url in links:
				url = url.get_text()
				if url is not None:
					sm_total.append(url)

			sm_total = set(sm_total)
			print(G + '['.rjust(7, '.') + ' {} ]'.format(str(len(sm_total))) + W)
			exporter(data, output, sm_total, 'sitemap')
		elif sm_sc == 404:
			print(R + '['.rjust(8, '.') + ' Not Found ]' + W)
		else:
			print(f'{R}{"[".rjust(8, ".")} Status Code : {sm_sc} ]{W}')
	except Exception as e:
		print(f'\n{R}[-] Exception : {C}{e}{W}')


async def css(target, data, output):
	global css_total
	print(f'{G}[+] {C}Extracting CSS Links{W}', end='', flush=True)
	css_links = soup.find_all('link', href=True)

	for link in css_links:
		url = link.get('href')
		if url is not None and '.css' in url:
			css_total.append(url_filter(target, url))

	css_total = set(css_total)
	print(G + '['.rjust(11, '.') + ' {} ]'.format(str(len(css_total))) + W)
	exporter(data, output, css_total, 'css')


async def js(target, data, output):
	global js_total
	print(f'{G}[+] {C}Extracting Javascript Links{W}', end='', flush=True)
	scr_tags = soup.find_all('script', src=True)

	for link in scr_tags:
		url = link.get('src')
		if url is not None and '.js' in url:
			tmp_url = url_filter(target, url)
			if tmp_url is not None:
				js_total.append(tmp_url)

	js_total = set(js_total)
	print(G + '['.rjust(4, '.') + ' {} ]'.format(str(len(js_total))) + W)
	exporter(data, output, js_total, 'javascripts')


async def internal_links(target, data, output):
	global int_total
	print(f'{G}[+] {C}Extracting Internal Links{W}', end='', flush=True)

	ext = tldextract.extract(target)
	domain = ext.registered_domain

	links = soup.find_all('a')
	for link in links:
		url = link.get('href')
		if url is not None and domain in url:
			int_total.append(url)

	int_total = set(int_total)
	print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(int_total))) + W)
	exporter(data, output, int_total, 'internal_urls')


async def external_links(target, data, output):
	global ext_total
	print(f'{G}[+] {C}Extracting External Links{W}', end='', flush=True)

	ext = tldextract.extract(target)
	domain = ext.registered_domain

	links = soup.find_all('a')
	for link in links:
		url = link.get('href')
		if url is not None and domain not in url and 'http' in url:
			ext_total.append(url)

	ext_total = set(ext_total)
	print(G + '['.rjust(6, '.') + ' {} ]'.format(str(len(ext_total))) + W)
	exporter(data, output, ext_total, 'external_urls')


async def images(target, data, output):
	global img_total
	print(f'{G}[+] {C}Extracting Images{W}', end='', flush=True)
	image_tags = soup.find_all('img')

	for link in image_tags:
		url = link.get('src')
		if url is not None and len(url) > 1:
			img_total.append(url_filter(target, url))

	img_total = set(img_total)
	print(G + '['.rjust(14, '.') + ' {} ]'.format(str(len(img_total))) + W)
	exporter(data, output, img_total, 'images')


async def sm_crawl(data, output):
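	# Fetch every nested sitemap found earlier, one daemon thread per URL,
	# and collect the <loc> entries inside each.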
	global sm_crawl_total
	print(f'{G}[+] {C}Crawling Sitemaps{W}', end='', flush=True)

	threads = []

	def fetch(site_url):
		try:
			sm_rqst = requests.get(site_url, headers=user_agent, verify=False, timeout=10)
			sm_sc = sm_rqst.status_code
			if sm_sc == 200:
				sm_data = sm_rqst.content.decode()
				sm_soup = bs4.BeautifulSoup(sm_data, 'xml')
				links = sm_soup.find_all('loc')
				for url in links:
					url = url.get_text()
					if url is not None:
						sm_crawl_total.append(url)
			elif sm_sc == 404:
				# print(R + '['.rjust(8, '.') + ' Not Found ]' + W)
				pass
			else:
				# print(R + '['.rjust(8, '.') + ' {} ]'.format(sm_sc) + W)
				pass
		except Exception:
			# print(f'\n{R}[-] Exception : {C}{e}{W}')
			pass

	for site_url in sm_total:
		# Skip the sitemap already parsed and anything that is not XML
		if site_url != sm_url and site_url.endswith('xml'):
			t = threading.Thread(target=fetch, args=[site_url])
			t.daemon = True
			threads.append(t)
			t.start()

	for thread in threads:
		thread.join()

	sm_crawl_total = set(sm_crawl_total)
	print(G + '['.rjust(14, '.') + ' {} ]'.format(str(len(sm_crawl_total))) + W)
	exporter(data, output, sm_crawl_total, 'urls_inside_sitemap')


async def js_crawl(data, output):
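	# Download every discovered .js file in parallel and regex out any
	# quoted absolute http(s) URLs embedded in the source.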
	global js_crawl_total
	print(f'{G}[+] {C}Crawling Javascripts{W}', end='', flush=True)

	threads = []

	def fetch(js_url):
		try:
			js_rqst = requests.get(js_url, headers=user_agent, verify=False, timeout=10)
			js_sc = js_rqst.status_code
			if js_sc == 200:
				js_data = js_rqst.content.decode()
				js_data = js_data.split(';')
				for line in js_data:
					if any(['http://' in line, 'https://' in line]):
						found = re.findall(r'\"(http[s]?://.*?)\"', line)
						for item in found:
							if len(item) > 8:
								js_crawl_total.append(item)
		except Exception as e:
			print(f'\n{R}[-] Exception : {C}{e}{W}')

	for js_url in js_total:
		t = threading.Thread(target=fetch, args=[js_url])
		t.daemon = True
		threads.append(t)
		t.start()

	for thread in threads:
		thread.join()

	js_crawl_total = set(js_crawl_total)
	print(G + '['.rjust(11, '.') + ' {} ]'.format(str(len(js_crawl_total))) + W)
	exporter(data, output, js_crawl_total, 'urls_inside_js')


def exporter(data, output, list_name, file_name):
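	# Store the links under a module-crawler-* key and hand off to export()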
	data[f'module-crawler-{file_name}'] = {'links': list(list_name), 'exported': False}
	fname = f'{output["directory"]}/{file_name}.{output["format"]}'
	output['file'] = fname
	export(output, data)


def stats(output, data):
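	# Merge every per-category list, dedupe, and record summary counts.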
	global total

	total.extend(r_total)
	total.extend(sm_total)
	total.extend(css_total)
	total.extend(js_total)
	total.extend(js_crawl_total)
	total.extend(sm_crawl_total)
	total.extend(int_total)
	total.extend(ext_total)
	total.extend(img_total)
	total = set(total)

	print(f'\n{G}[+] {C}Total Unique Links Extracted : {W}{len(total)}')

	if output != 'None':
		if len(total) != 0:
			data['module-crawler-stats'] = {'Total Unique Links Extracted': str(len(total))}
			try:
				target_title = soup.title.string
			except AttributeError:
				target_title = 'None'
			data['module-crawler-stats'].update({'Title': str(target_title)})

			data['module-crawler-stats'].update(
				{
					'total_urls_robots': len(r_total),
					'total_urls_sitemap': len(sm_total),
					'total_urls_css': len(css_total),
					'total_urls_js': len(js_total),
					'total_urls_in_js': len(js_crawl_total),
					'total_urls_in_sitemaps': len(sm_crawl_total),
					'total_urls_internal': len(int_total),
					'total_urls_external': len(ext_total),
					'total_urls_images': len(img_total),
					'total_urls': len(total)
				})
			data['module-crawler-stats'].update({'exported': False})
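
# Minimal usage sketch (hypothetical values -- in FinalRecon proper the
# `output` and `data` dicts are built from the CLI by the main script):
#
#   data = {}
#   output = {'directory': '/tmp/fr_out', 'format': 'txt'}
#   crawler('https://example.com', output, data)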