Codebase list osrframework / fad7703 osrframework / utils / browser.py
fad7703

Tree @fad7703 (Download .tar.gz)

browser.py @fad7703raw · history · blame

# !/usr/bin/python
# -*- coding: cp1252 -*-
#
##################################################################################
#
#    Copyright 2016 Félix Brezo and Yaiza Rubio (i3visio, [email protected])
#
#    This file is part of OSRFramework. You can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
##################################################################################

# Required libraries
import cookielib
import ConfigParser
import mechanize
import os
import random

import osrframework.utils.configuration as configuration

# logging imports
import logging

class Browser(object):
    """
        Utility used to code a Browser.

        It wraps a mechanize.Browser together with a cookie jar, a list of
        candidate User Agents, a per-protocol proxy table and a timeout. The
        User Agents, the proxies and the timeout are read from the
        "browser.cfg" configuration file when it is available.
    """

    # User Agent used when the configuration does not provide a usable one.
    DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/55.0.2883.87 Chrome/55.0.2883.87 Safari/537.36'

    # Timeout handed to the browser when opening a resource, used when the
    # configuration does not provide a valid integer.
    DEFAULT_TIMEOUT = 2

    def __init__(self):
        """
            Recovering an instance of a new Browser.

            The browser is configured first with fixed options and then with
            the values found in "browser.cfg". If that file does not exist it
            is created by copying the default one; if the copy fails, a
            warning is printed and the default User Agent is used with no
            proxies.
        """
        # Browser
        self.br = mechanize.Browser()

        # Cookie Jar
        self.cj = cookielib.LWPCookieJar()
        self.br.set_cookiejar(self.cj)

        # Browser options
        self.br.set_handle_equiv(True)
        self.br.set_handle_gzip(False)
        self.br.set_handle_redirect(True)
        self.br.set_handle_referer(False)
        self.br.set_handle_robots(False)
        self.br.set_handled_schemes(['http', 'https'])

        # Follows refresh 0 but not hangs on refresh > 0
        self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # Defining User Agents
        self.userAgents = []

        # Handling proxies: maps a protocol name to a proxy address string
        self.proxies = {}
        self.timeout = self.DEFAULT_TIMEOUT

        # Want debugging messages?
        #self.br.set_debug_http(True)
        #self.br.set_debug_redirects(True)
        #self.br.set_debug_responses(True)

        # Trying to read the configuration
        # --------------------------------
        configPath = os.path.join(configuration.getConfigPath()["appPath"], "browser.cfg")

        # If the configuration file has not been found, creating it by
        # copying it from the defaults folder
        if not os.path.exists(configPath) and not self._copyDefaultConfig(configPath):
            # Storing configuration as default
            self.userAgents = [self.DEFAULT_USER_AGENT]
            self.proxies = {}
            return

        self._loadConfig(configPath)

    def _copyDefaultConfig(self, configPath):
        """
            Private method that creates configPath from the default browser.cfg.

            :param configPath:    Path where the configuration file will be written.

            :return:    True if the file was copied. False if anything failed, in which case a warning has been printed.
        """
        try:
            # Copy the data from the default folder
            defaultConfigPath = os.path.join(configuration.getConfigPath()["appPathDefaults"], "browser.cfg")

            with open(defaultConfigPath) as iF:
                cont = iF.read()
            with open(configPath, "w") as oF:
                oF.write(cont)
        except Exception as e:
            # Deliberately broad: any failure here just means falling back to
            # the hardcoded defaults.
            print("WARNING. No configuration file could be found and the default file was not found either, so configuration will be set as default.")
            print(str(e))
            # An explicit empty string prints a blank line both with the
            # print statement and with the print function.
            print("")
            return False
        return True

    def _loadConfig(self, configPath):
        """
            Private method that reads User Agents, timeout and proxies from a file.

            The "Browser" section provides the "user_agent" and "timeout"
            parameters. Any other section is considered a proxy definition.

            :param configPath:    Path to the configuration file to be read.
        """
        # Reading the configuration file
        config = ConfigParser.ConfigParser()
        config.read(configPath)

        proxy = {}

        # Iterating through all the sections
        for conf in config.sections():
            if conf == "Browser":
                # Iterating through parameters
                for (param, value) in config.items(conf):
                    if param == "user_agent":
                        if value != '':
                            self.userAgents.append(value)
                        else:
                            self.userAgents = [self.DEFAULT_USER_AGENT]
                    elif param == "timeout":
                        try:
                            self.timeout = int(value)
                        except (TypeError, ValueError):
                            # Not a valid integer, keeping the default
                            self.timeout = self.DEFAULT_TIMEOUT
            else:
                # Keeping only the parameters that have been filled in
                proxy[conf] = {param: value for (param, value) in config.items(conf) if value != ''}

        # Configuring the proxy as it will be used by br.set_proxies
        for section, params in proxy.items():
            if "host" not in params or "port" not in params:
                # We are not adding this protocol to be proxied
                continue

            address = params["host"] + ":" + params["port"]

            # Adding credentials if they exist
            if "username" in params and "password" in params:
                address = params["username"] + ":" + params["password"] + "@" + address

            # section ~= ProxyHTTP --> Protocol = section.lower()[5:]
            self.proxies[section.lower()[5:]] = address

    def recoverURL(self, url):
        """
            Public method to recover a resource.

            Before opening the resource, a User Agent is set and the proxy
            for the protocol of the url (if any was configured) is applied.

            :param url:    The url of the resource to be recovered.

            :return:    The content read from the response.
        """
        logger = logging.getLogger("osrframework.utils")

        # Configuring user agents...
        self.setUserAgent()

        # Configuring proxies. The scheme is checked at the beginning of the
        # url, so an "https://" that appears later (e. g.: as a parameter)
        # does not select the wrong proxy.
        if url.lower().startswith("https://"):
            self.setProxy(protocol="https")
        else:
            self.setProxy(protocol="http")

        # Giving special treatment for .onion platforms
        if ".onion" in url:
            # TODO: configuring manually the tor bundle
            # TODO: capturing the error and eventually trying the tor2web approach
            #url = url.replace(".onion", ".tor2web.org")
            url = url.replace(".onion", ".onion.cab")

        logger.debug("Retrieving the resource: %s", url)
        # Opening the resource
        recurso = self.br.open(url, timeout=self.timeout)

        logger.debug("Reading html code from: %s", url)
        # [TO-DO]
        #    Additional things may be done here to load javascript.
        html = recurso.read()

        return html

    def setNewPassword(self, url, username, password):
        """
            Public method to manually set the credentials for a url in the browser.

            :param url:    The url for which the credentials will be used.
            :param username:    The username to be registered.
            :param password:    The password to be registered.
        """
        self.br.add_password(url, username, password)

    def setProxy(self, protocol="http"):
        """
            Public method to set a proxy for the browser.

            Nothing is done if no proxy was defined for the given protocol.

            :param protocol:    Key of self.proxies whose proxy will be applied. By default, "http".
        """
        try:
            address = self.proxies[protocol]
        except KeyError:
            # No proxy defined for that protocol
            return

        # Setting proxy
        self.br.set_proxies({protocol: address})

    def setUserAgent(self, uA=None):
        """
            This method will be called whenever a new query will be executed.

            :param uA:    Any User Agent that was needed to be inserted. This parameter is optional. If not provided, one is randomly chosen from self.userAgents.

            :return:    Returns True if a User Agent was inserted and False if no User Agent could be inserted.
        """
        logger = logging.getLogger("osrframework.utils")

        if not uA:
            # Setting the User Agents
            if not self.userAgents:
                logger.debug("No user agent was inserted.")
                return False

            # User-Agent (this is cheating, ok?)
            logger.debug("Selecting a new random User Agent.")
            uA = random.choice(self.userAgents)

        # Replacing any previous additional header with the User Agent
        self.br.addheaders = [('User-agent', uA), ]

        return True