Codebase list osrframework / 5315caf osrframework / thirdparties / infobel_com / processing.py
5315caf

Tree @5315caf (Download .tar.gz)

processing.py @5315cafraw · history · blame

# !/usr/bin/python
# -*- coding: cp1252 -*-
#
##################################################################################
#
#    This program is part of OSRFramework. You can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
##################################################################################

import argparse
import json
import re
import sys

import osrframework.utils.browser as browser

def extractFieldsFromResult(data):
    '''
        Method that parses Infobel textual information to return a series of attributes.
        
        :return:    a list of i3visio-like objects.
    ''' 
    
    entities = []
    
    # Defining the objects to extract
    fieldsRegExp = {}
    fieldsRegExp["i3visio.fullname"] = "<span class=\"fn\">([^<]*)</span>"
    fieldsRegExp["i3visio.name"] = "    por <strong>[^ ]* ([^<]*)</strong>"        
    fieldsRegExp["i3visio.surname"] = "    por <strong>([^ ]*) "     
    fieldsRegExp["i3visio.location.address"] = "itemprop=\"streetAddress\">([^<]*)</span>"
    fieldsRegExp["i3visio.location.city"] = "addressLocality\">([^<]*)</span>"
    fieldsRegExp["i3visio.location.postalcode"] = "postalCode\">([^<]*)</span>"    
    fieldsRegExp["i3visio.phone"] = "document.write\('([0-9]+)'"        

    for field in fieldsRegExp.keys():
        listRecovered = re.findall(fieldsRegExp[field], data)
        if len(listRecovered) >0:
            aux = {}
            aux["type"]= field
            aux["value"] = listRecovered[0].replace('\xa0', ' ')
            aux["attributes"] = []
            entities.append(aux)
            
    return entities
    
def getResults(uri):
    '''
        Method that recovers the text for each result in infobel.com
        
        :param uri: Infobel uri
        
        :return:    A list of textual information to be processed
    '''
    # Using i3visio browser to avoid certain issues...
    i3Browser = browser.Browser()

    data = i3Browser.recoverURL(uri)        

    # Strings to be searched
    regExp = "<!-- Results -->(.*)<!-- /Results -->"
    # re.DOTALL is needed to match any character INCLUDING \n
    results = re.findall(regExp, data, re.DOTALL)
    
    return results