diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7596c0281eb7e7badb8b3a8788108d21b3d90470..feb2933fc6fda30c6d39b0670d87b5d2bc7aa492 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -46,6 +46,7 @@ build:scheduling-abstraction-layer: script: - $SCHEDULING_ABSTRACTION_LAYER_CLI clean install artifacts: + expire_in: 1 week paths: - /builds/melodic/morphemic-preprocessor/maven_repo/org/activeeon/scheduling-abstraction-layer/ @@ -58,6 +59,7 @@ build:amq-message-java-library: script: - $AMQ_MESSAGE_JAVA_LIBRARY_CLI clean install artifacts: + expire_in: 1 week paths: - /builds/melodic/morphemic-preprocessor/maven_repo/gr/ntua/imu/morphemic/amq-message-java-library/ @@ -82,6 +84,7 @@ build:slo-severity-calculator: script: - $SLO_SEVERITY_CALCULATOR_CLI -Dtest=!UnboundedMonitoringAttributeTests,!ConnectivityTests clean install artifacts: + expire_in: 1 week paths: - /builds/melodic/morphemic-preprocessor/maven_repo/gr/ntua/imu/morphemic/SLOSeverityCalculator/ diff --git a/deployment/arima/src/model_predict.py b/deployment/arima/src/model_predict.py index 0068fccd33d263e342f8d70d329d2ea494290d96..9ffa970c066efe6dc34fd99d6f957dcd21e621f8 100644 --- a/deployment/arima/src/model_predict.py +++ b/deployment/arima/src/model_predict.py @@ -8,11 +8,15 @@ from filelock import FileLock from src.preprocess_dataset import Dataset import time import logging +import pytz +from datetime import datetime pd.options.mode.chained_assignment = None """Script for temporal fusion transformer prediction""" +TZ = os.environ.get("TIME_ZONE", "Europe/Vienna") + def predict( target_column, diff --git a/web-crawler/.project b/web-crawler/.project new file mode 100644 index 0000000000000000000000000000000000000000..7570ca800b964e92865290f0f859858d45b544c1 --- /dev/null +++ b/web-crawler/.project @@ -0,0 +1,17 @@ + + + web-crawler + + + + + + org.python.pydev.PyDevBuilder + + + + + + org.python.pydev.pythonNature + + diff --git a/web-crawler/.settings/org.eclipse.core.resources.prefs b/web-crawler/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000000000000000000000000000000000000..dc29be3d889d296e04ae28442666b936e27d4de6 --- /dev/null +++ b/web-crawler/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding/testAPI.py=utf-8 diff --git a/web-crawler/CrawlerDatabase.py b/web-crawler/CrawlerDatabase.py index 47ccfeedfe4506fa83c8dd4c54b742d77dd3880b..36563444f1b6f3825cbe11d6548a1acc9e0039b7 100755 --- a/web-crawler/CrawlerDatabase.py +++ b/web-crawler/CrawlerDatabase.py @@ -120,7 +120,7 @@ class CrawlerDatabase: else: parameters = Utf8Helper.encode_parameters(parameters) cur.execute(sqlString, parameters) - except Exception, e: + except Exception as e: logger.error("SQL = " + sqlString + " PARAMETERS = " + str(parameters)) logger.error(str(e)) if commit: @@ -135,8 +135,8 @@ class Utf8Helper(): this method is used to encode parameters Some chars >=128 end up in the string so I need to remove the to prevent exceptions ''' - for k in parameters.keys(): - if type(parameters[k]) == unicode or type(parameters[k]) == etree._ElementUnicodeResult: + for k in list(parameters.keys()): + if type(parameters[k]) == str or type(parameters[k]) == etree._ElementUnicodeResult: safe_string = StringHelper.removeNonAscii(parameters[k]).encode('utf8') #logger.info("Utf8Helper.encode_parameters has encoded: " + safe_string) parameters[k] = safe_string diff --git a/web-crawler/Integrator.py b/web-crawler/Integrator.py index 
e517fa08074a641ee1dca74d1b0c3075f4655423..7bd02d4eea5503199e0d7653411488bd19266e3d 100755 --- a/web-crawler/Integrator.py +++ b/web-crawler/Integrator.py @@ -4,22 +4,25 @@ # # Copyright 2014 Bitergium SLL -#February 2021 -#Owner Engineering Ingegneria Informatica S.p.A. -#Update the code for the MORPHEMIC release 1.5 +#Updated on February 2021 for MORPHEMIC project by Maria Antonietta Di Girolamo +# Ownership : Engineering Ingegneria Informatica S.p.A. +# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo + from dw_batch import DwBatch -from doap_project import DoapProject, DoapRepository, DoapVersion, FoafPerson +#from doap_project import DoapProject, DoapRepository, DoapVersion, FoafPerson from Utils import Logger, TrivialJSONEncoder, Configuration, DateHelper, Sources, StringList from github_data_fetcher import GithubArchiveStats -from doapfiend.doaplib import load_graph -from doapfiend.model import Project as doap +#from doapfiend.doaplib import load_graph +#from doapfiend.model import Project as doap from CrawlerDatabase import CrawlerDatabase import base64 import site import sys import os, stat import urllib3 -import json2html +#import json2html import requests import decimal import json diff --git a/web-crawler/Orchestrator.py b/web-crawler/Orchestrator.py index f60c1b2e52f32be19c37a6f2db2404e871adc51c..41317653f32b4e1c5468c0b4493d5ed9697f75ff 100755 --- a/web-crawler/Orchestrator.py +++ b/web-crawler/Orchestrator.py @@ -4,20 +4,24 @@ # # Copyright 2014 Bitergium SLL -#February 2021 -#Owner Engineering Ingegneria Informatica S.p.A. -#Update the code for the MORPHEMIC release 1.5 +#Updated on February 2021 for MOPRHEMIC project release 1.0 +# Ownership : Engineering Ingegneria Informatica S.p.A. +# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo +#Last updated on July2021 -import time from Integrator import ApacheIntegrator, GithubIntegrator, JQueryPluginIntegrator - from apache_data_fetcher import ApacheDataFetcher from jquery_plugin_data_fetcher import JQueryPluginDataFetcher from Utils import Configuration, Logger, Sources from CrawlerDatabase import CrawlerDatabase -from RepositoryCrawler import RepositoryCrawler +#from RepositoryCrawler import RepositoryCrawler from generic_data_fetcher import GenericDataFetcher from github_data_fetcher import GitHubDataFetcher +#from r_forge_data_fetcher import R_Forge_DataFetcher +import traceback +import time class Orchestrator(): @@ -38,63 +42,50 @@ class Orchestrator(): logger.info("Waking up after " + str(Configuration.sleep_time) + " seconds.") Configuration() if Configuration.exit_now: - logger.info("Exiting Orchestrator as Configuration.exit_now is True.") - break + logger.info("Exiting Orchestrator as Configuration.exit_now is True.") + break def iteration(self): logger = Logger.getInstance() - logger.info("Starting retrieving of the metadata project") ## DEBUG!! + #logger.info("Starting fetch the projects open source") ## DEBUG!! 
start = time.time() - #gh = GitHubDataFetcher() - #while True: + #rc = RepositoryCrawler() + #rc = RepositoryCrawler() try: - #gh.run() + #rc.run() + #end = time.time() + #print(('Time taken to load github fetcher into crawler db is : ', str(end - start))) + if Configuration.github_every_n_days > 0: - gh = GitHubDataFetcher() - gh.run() - gh.batch.complete() - end = time.time() - print('Time taken to load github fetcher into crawler db is : ', str(end - start)) - #logger.info("Fetching data from GitHubArchive through RepositoryCrawler") - ''' - start = time.time() - ghi = GithubIntegrator() - if hasattr(ghi, 'batch') and not (ghi.batch is None): - ghi.integrate() - ghi.limitBatchLength() - - end = time.time() - print('Time taken to load into MetadataProject table is : ', str(end - start)) - ''' - # APACHE - - if Configuration.apache_every_n_days > 0: + logger.info("dentro githubveryndays") + gh = GitHubDataFetcher() + #logger.info("prima di run githubdatafetcher") + gh.run() + logger.info("dopo github.run") + gh.batch.complete() + end = time.time() + print('Time taken to load github fetcher into crawler db is : ', str(end - start)) + + if Configuration.apache_every_n_days > 0: logger.info("Starting ApacheDataFetcher") adf = ApacheDataFetcher() - #logger.info("ci arrivi qu iprima del run?") adf.run() - adf.batch.complete() - ai = ApacheIntegrator() - if hasattr(ai, 'batch') and not (ai.batch is None): - ai.integrate() - ai.limitBatchLength() - - #JQueryPlugin - if Configuration.jqueryplugin_every_n_days > 0: + #adf.batch.complete() + end = time.time() + print('Time taken to load apache fetcher into crawler db is : ', str(end - start)) + + + if Configuration.jqueryplugin_every_n_days > 0: logger.info("Starting JQueryPluginDataFetcher") - jq = JQueryPluginDataFetcher(Sources.JQueryPlugin) + jq = JQueryPluginDataFetcher() jq.run() jq.batch.complete() - jqi = JQueryPluginIntegrator() - if hasattr(jqi, 'batch') and not (jqi.batch is None): - jqi.integrate() - jqi.limitBatchLength() - except Exception, ex: + except Exception as ex: + traceback.print_exc() #logging.error(str(ex)) - logger.info(str(ex)) - - -Orchestrator() + #logger.info(str(ex)) + +Orchestrator() \ No newline at end of file diff --git a/web-crawler/README b/web-crawler/README index 22404339ac266545434b3a00ab33b977ab8f8928..58e008b3f4e16b490b1304ef10f692872f4f2603 100755 --- a/web-crawler/README +++ b/web-crawler/README @@ -5,7 +5,12 @@ # Copyright 2014 Bitergium SLL MARKOS Crawler has been realized for the MARKOS EU Project (2003-2005) MARKOS Crawler is updated for the MORPHEMIC project by ENG team. -create under hme directory : $HOME/markos/markos02/github -Example: /home/ubuntu/markos/markos02/github + On july 2020 the flossmole forge has been dismissed , so the ENG team -decide to dismiss flossmole from MARKOS Crawler for the MORPHEMIC project. \ No newline at end of file +decide to dismiss flossmole from the Web-Crawler for the MORPHEMIC project. +On March2021 the creation of the doap model is separated from WEb-Crawler and added into a new +service of the MORPHEMIC project (called KnowledgeBase). +On May 2021 dismissed the Rest API developed by Engineering for the management of the communication with the other service of the +Markos project and uploaded with the ElasticSearch client. +New RESTful API used by KnowledgeBase to communicate with the Web-Crawler. 
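Editor's note: the hunks above and below apply the same Python 2 -> 3 migration throughout the crawler ("except ... as ...", configparser instead of ConfigParser, urllib.request instead of urllib2, str instead of unicode, print as a function). A minimal standalone sketch of that pattern, using only standard-library calls and a hypothetical config path and URL, not the crawler's own helpers:

    import configparser
    import urllib.request

    def load_config(path):
        # Python 3: the configparser module replaces Python 2's ConfigParser
        config = configparser.ConfigParser()
        config.read(path)
        return config

    def fetch(url, timeout=10):
        # Python 3: urllib.request.urlopen replaces urllib2.urlopen
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                return response.read()
        except Exception as e:  # Python 3 syntax: "except Exception as e", not "except Exception, e"
            print("ERROR: " + str(e))  # print() is a function; str() covers the old unicode cases
            return b""
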
+On July2021 Web-Crawler code is updated with a new, stable version of the Python -> 3.9.6 diff --git a/web-crawler/RepositoryCrawler.py b/web-crawler/RepositoryCrawler.py index 5b5c26ec1dddc1d926b8978a68b76d0a13dad214..ce7d4abff5fb437e9151c5c9ad84a8cf3bd5fac6 100755 --- a/web-crawler/RepositoryCrawler.py +++ b/web-crawler/RepositoryCrawler.py @@ -16,18 +16,9 @@ import traceback import time from generic_data_fetcher import GenericDataFetcher -from __builtin__ import False - -from webob import Response, exc,request -from cornice import Service, validators -import uuid -import sys -from pyramid.renderers import render -from webob.request import Request - -sys.path.append('..') - -crawler = Service(name='sendmessage', path='/crawler', description="send message") +from github_data_fetcher import GitHubDataFetcher +from apache_data_fetcher import ApacheDataFetcher +from jquery_plugin_data_fetcher import JQueryPluginDataFetcher class RepositoryCrawler(GenericDataFetcher): ''' @@ -37,232 +28,46 @@ class RepositoryCrawler(GenericDataFetcher): super(RepositoryCrawler, self).__init__(Sources.Github) def run(self): + logger = Logger.getInstance() ''' ''' + try: + start = time.time() + if Configuration.github_every_n_days > 0: + logger.info("dentro githubveryndays") + gh = GitHubDataFetcher() + #logger.info("prima di run githubdatafetcher") + gh.run() + logger.info("dopo github.run") + gh.batch.complete() + end = time.time() + print(('Time taken to load github fetcher into crawler db is : ', str(end - start))) + + if Configuration.apache_every_n_days > 0: + logger.info("Starting ApacheDataFetcher") + adf = ApacheDataFetcher() + df.run() + #adf.batch.complete() + end = time.time() + print(('Time taken to load apache fetcher into crawler db is : ', str(end - start))) + + #JQueryPlugin + if Configuration.jqueryplugin_every_n_days > 0: + logger.info("Starting JQueryPluginDataFetcher") + jq = JQueryPluginDataFetcher() + jq.run() + jq.batch.complete() + print(('Time taken to load apache fetcher into crawler db is : ', str(end - start))) + + sleep_time = Configuration.repository_crawler_sleep_time + logger.info("Repository Crawler about to sleep for " + str(sleep_time) + " seconds.") + time.sleep(sleep_time) + # to avoid 2006 'MySQL server has gone away' issue + CrawlerDatabase.connect() + logger.info("Repository Crawler waking up after " + str(sleep_time) + " seconds.") - months_of_stats = 0 - dt = date.today() - while months_of_stats < Configuration.github_archive_months: - dt1 = dt.replace(day=1) #go to first day of month - dt = dt1 - timedelta(days=1) #back one day so I get previous month - year_minus_cursor = dt.year - month_minus_cursor = dt.month - gas = GithubArchiveStats(year_minus_cursor, month_minus_cursor) - #do I have data for this month - parameters = { - 'stat_year': year_minus_cursor, - 'stat_month': month_minus_cursor - } - if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: - months_of_stats = months_of_stats + 1 - elif gas.filesAvailable(): - gas.fetchFiles() - gas.processFiles() - months_of_stats = months_of_stats + 1 - #process donwloaded file finisched - #Crawler send a message to knowledgebase : ready - #self.send_message() - - - - - -class GithubArchiveStats(): - """ - We fetch statistics from http://www.githubarchive.org/ - an instance is taking care of a specific month in a year - """ - def __init__(self, y, m): - self.y = y - self.m = m - - self.mm = "%02d" % m - #self.idDWBatch = 
self.batch.id_batch - dt = date.today() - dt1 = dt.replace(day=1) #go to first day of month - dt = dt1 - timedelta(days=31) #add 31 days so I go to next month - self.mm_next_month = "%02d" % dt.month - self.yyyy_next_month = "%02d" % dt.year - - - @staticmethod - def statsAvailable(): - """ - Returns true if there are N months of statistics in the local database out of the last N+1 months - where N = Configuration.github_archive_months - we look back N+1 months because testermonth's statistics will not be ready the during the first days - of the month; so it is ok to have the last N available even if yestermonth is not there - """ - logger = Logger.getInstance() - months_of_stats = 0 - how_many = 0 - date_cursor = date.today() - while months_of_stats <= Configuration.github_archive_months: - dt1 = date_cursor.replace(day=1) #go to first day of month - date_cursor = dt1 - timedelta(days=1) #back one day so I get previous month - year_minus_cursor = date_cursor.year - month_minus_cursor = date_cursor.month - #do I have data for this month - parameters = { - 'stat_year': year_minus_cursor, - 'stat_month': month_minus_cursor - } - - if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: - how_many = how_many + 1 - months_of_stats = months_of_stats + 1 - logger.debug("GithubArchiveStats.statsAvailable: Do we have any stats to process?" + str(how_many >= Configuration.github_archive_months)) ## DEBUG - return how_many >= Configuration.github_archive_months - - - def filesAvailable(self): - """ - Are files available at http://data.githubarchive.org - I assume all files for a month are available if first file of next month is available - """ - proc = Popen("wget http://data.githubarchive.org/" + self.yyyy_next_month + "-" + self.mm_next_month + "-01-0.json.gz", shell=True, cwd=Configuration.temporary_directory) - return_code = proc.wait() - if return_code == 8: - return False - return True - - def fetchFiles(self): - """ - Files are per hour with name: YEAR-MONTH-DAY-HOUR.json.gz - """ - logger = Logger.getInstance() - #Pavia: for day_iter in range(1, monthrange(self.y, self.m)[1] + 1): #number of days in this month - for day_iter in range(1, 3): #number of days in this month - - #Pavia: for hour_iter in range(24): - for hour_iter in range(10, 12): - sz_day = "%02d" % day_iter - sz_hour = str(hour_iter) - - if not os.path.isfile(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz"): - proc = Popen("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz", shell=True, cwd=Configuration.github_file_path + "/gh") - return_code = proc.wait() - if return_code == 8: - logger.error("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz" + " returned error code 8") - - - - - def processFiles(self): - logger = Logger.getInstance() - - compressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json.gz") - for compressed_file in compressed_files: - proc = Popen("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh") - return_code = proc.wait() - - uncompressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json") - for uncompressed_file in uncompressed_files: - with open(uncompressed_file) as f: - content = f.readlines() - for line in 
content: - try: - decoded = json.loads(line) - # GistEvent lines have no repository - if decoded["type"] != "GistEvent" : #not interested in Gists - #To speed up testing restrict to ReleaseEvent - #if decoded["type"] == "ReleaseEvent": - repo = decoded["repo"] - - logger.debug("Parsing event type: " + decoded["type"] + " from project: " + repo["name"]) - try: - if decoded["type"] == "RepositoryEvent" and ( decoded["action"] == "created" or decoded["action"] == "edited" or decoded["action"] == "renamed" ): - try: - project_description = decoded["description"] - logger.debug("Found description:" + project_description + " for project: " + repo["name"]) - except: - project_description = "" - else: - project_description = "" - - #print("DEBUG!! processfiles ") - #time.sleep(1.5) - ''' - if decoded["type"] == "PullRequestEvent" : - payload = decoded["payload"] - pull_request = payload["pull_request"] - deployments_url = pull_request["deployments_url"] - license = pull_request["license"] - language = pull_request["language"] - logger.debug("deploy " + deployment_url + " license " + license + " language " + language) - ''' - #Pavia: in questo pezzo di codice incrementa gli eventi relativi ad un progetto gia' conosciuto per il periodo preso in considerazione nelle statistiche - #se in precedenza abbiamo trovato una descrizione del progetto aggiorna il relativo campo - parameters = { - 'project_name': str(repo["name"]), - 'description': str(project_description), - 'stat_year': self.y, - 'stat_month': self.m, - } - #print "parameters" - #print(str(parameters)) - #print "str(idDWBatch)" - #print str(self.idDWBatch) - #print("SELECT count(*) from rc_gh_archive WHERE project_name = '" + str(repo["name"]) + "' AND idGhProject >0 ") - #if CrawlerDatabase.select_int("SELECT count(*) from rc_gh_archive WHERE project_name = '" + str(repo["name"]) + "' AND idGhProject >0 ") > 0 : - #Mria February 2021 limitiamo il dowmload dei progetti da GitHub solo per ReleaseEvent - if decoded["type"] == "ReleaseEvent": - - if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", parameters) > 0: - if parameters['description'] == "": #if description is empty I do not overwrite it as it might have been there in other events - CrawlerDatabase.execute("UPDATE rc_gh_archive SET event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", False, parameters) - else: - CrawlerDatabase.execute("UPDATE rc_gh_archive SET description=%(description)s, event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", False, parameters) - else: - #Maria febbratio 2021 : Insert non funziona sull achiave primaria project_name quando - #con diversa release developername - #USiamo inert in diverso modo : INSERT INTO ins_duplicate VALUES (4,'Gorilla') ON DUPLICATE KEY UPDATE animal='Gorilla'; - #CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s) ON DUPLICATE KEY UPDATE project_name=%(project_name)s", False, parameters) - CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s)", True, parameters) - - #Pavia: se l'evento e' di tipo ReleaseEvent, qui 
parsiamo le informazioni necessarie per popolare la rc_gh_archive_release - #Commento di Maria February 2021 al momento prendiamo solo progetti della ReleaseEvent - #limitiamo il donwload dei progetti da GitHub - #if decoded["type"] == "ReleaseEvent": - #Pavia: l'"url" e l'"id" della release ora li troviamo sotto payload->release - payload = decoded["payload"] - release = payload["release"] - developer_name = '' - developer_name = repo["name"].rsplit("/",1)[0] - #print developer_name - parameters = { - 'project_name': str(repo["name"]), - 'developer_name': str(repo["name"].rsplit("/",1)[0]), - 'url': str(release["url"]), - 'version': str(release["id"]) - } - - #print("Found release event for project: " + repo["name"] + ", release id: " + str(release["id"]) + ", release url: " + release["url"]) - #print ("SELECT count(*) FROM rc_gh_archive_release WHERE project_name = " + repo["name"] + " AND version = " + release["id"]+ " AND developer_name=" + developer_name) - #sqlstring = "SELECT count(*) FROM rc_gh_archive_release a , rc_gh_archive b WHERE a.project_name = '" + repo["name"] + "'" + "AND a.project_name = b.project_name " - #sqlstring = sqlstring + " AND a.version ='" + str(release['id']) + "' AND a.developer_name = '" + str(repo["name"].rsplit("/",1)[0]) + "'" - #print "=======================================================" - #print sqlstring - #print "=======================================================" - - #if CrawlerDatabase.select_int("SELECT count(*) FROM rc_gh_archive_release WHERE project_name = '" + repo["name"] + "' AND version = '" + release["id"]+ "' AND developer_name='" + developer_name + "'") == 0: - # CrawlerDatabase.execute("UPDATE rc_gh_archive_release SET project_name=%(project_name)s AND version=%(version)s AND developer_name = %(developer_name)s AND url=%(url)s", False, parameters) - #else: - #try: - # CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version, developer_name) VALUES (%(project_name)s, %(url)s, %(version)s, %(developer_name)s) ", True, parameters) - #except: - # logger.error("Error INSERTING INTO rc_gh_archive_release: " + str(parameters)) - #else: - #logger.info("WARNING: The github project " + str(repo["name"]) + " with release " + str(release["id"]) + " for the developer name " + str(repo["name"].rsplit("/",1)[0]) + " exist.") - - logger.debug("Found release event for project: " + repo["name"] + ", release id: " + str(release["id"]) + ", release url: " + release["url"]) - CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version, developer_name) VALUES (%(project_name)s, %(url)s, %(version)s, %(developer_name)s) ON DUPLICATE KEY UPDATE url = %(url)s, version=%(version)s, developer_name=%(developer_name)s", True, parameters) - except Exception, ex: - logger.error(traceback.format_exc()) - - #logger.error(str(ex)) - # do nothing; sometimes repository is missing - - except Exception, ex: - logger.info(str(ex) + " missing in " + line) + except Exception as ex: + traceback.print_exc() + time.sleep(5) + #logger.error(str(ex)) diff --git a/web-crawler/RepositoryCrawlerThread.py b/web-crawler/RepositoryCrawlerThread.py index f1527284e6319969bd385c192dd976f6fd0cfc04..014f7e67bef7a15b825337698904b15aeb3c5b02 100755 --- a/web-crawler/RepositoryCrawlerThread.py +++ b/web-crawler/RepositoryCrawlerThread.py @@ -18,12 +18,11 @@ logger = Logger.getInstance() while True: try: - if Configuration.github_every_n_days > 0: - logger.info("Fetching data from GitHubArchive through RepositoryCrawler") - 
#before to integrate in the MetadataProject , the crawler should be send to the Analyser - #that metadataproject is ready ... - gh.run() - gh.batch.complete() + #logger.info("Fetching data from GitHubArchive through RepositoryCrawler") + #before to integrate in the MetadataProject , the crawler should be send to the Analyser + #that metadataproject is ready ... + gh.run() + gh.batch.complete() sleep_time = Configuration.repository_crawler_sleep_time logger.info("Repository Crawler about to sleep for " + str(sleep_time) + " seconds.") diff --git a/web-crawler/Utils.py b/web-crawler/Utils.py index c515375190813ed405e2703ffef9f0961827adc5..8888341253616b427ce005deb6310255e84651a4 100755 --- a/web-crawler/Utils.py +++ b/web-crawler/Utils.py @@ -4,17 +4,26 @@ # # Copyright 2014 Bitergium SLL +#Updated on April 2020 for MOPRHEMIC project +#Last updated on July 2021 +# Ownership : Engineering Ingegneria Informatica S.p.A. +# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo import logging import logging.handlers import base64 import datetime -import ConfigParser +#import ConfigParser +import configparser import json import os import sys from json import JSONEncoder from datetime import date -import urllib2 +#import urllib2 +import urllib.request, urllib.parse, urllib.error +from urllib import request class Logger(): @@ -91,35 +100,35 @@ class Logger(): def debug(self,msg): if self.__stdout and self.__logLevel<=0: - print ('DEBUG: '+ msg) + print(('DEBUG: '+ msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.getLogger().debug(msg) def error(self,msg): if self.__stdout and self.__logLevel<=3: - print ('ERROR: '+ msg) + print(('ERROR: '+ msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.error(msg) def info(self,msg): if self.__stdout and self.__logLevel<=1: - print ('INFO: '+msg) + print(('INFO: '+msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.getLogger( ).info(msg) def critical(self,msg): if self.__stdout and self.__logLevel<=5: - print ('CRITICAL: '+msg) + print(('CRITICAL: '+msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.critical(msg) def fatal(self,msg): if self.__stdout and self.__logLevel<=4: - print ('FATAL: '+msg) + print(('FATAL: '+msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.fatal(msg) @@ -127,7 +136,7 @@ class Logger(): def warn(self,msg): if self.__stdout and self.__logLevel<=2: - print ('WARN: '+msg) + print(('WARN: '+msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.warn(msg) @@ -135,7 +144,7 @@ class Logger(): def warning(self,msg): if self.__stdout and self.__logLevel<=2: - print ('WARNING: '+msg) + print(('WARNING: '+msg)) #BUG logger does not remember its level; it remembers its file path though #logging.getLogger( ).setLevel(Configuration.logging_level) logging.warning(msg) @@ -171,7 +180,7 @@ class DateHelper(): class StringHelper(): @staticmethod def removeNonAscii(thisString): - return "".join(filter(lambda x: ord(x)<128, thisString)) + 
return "".join([x for x in thisString if ord(x)<128]) @staticmethod def makeUnicodeSafe(thisString): ''' @@ -179,7 +188,7 @@ class StringHelper(): ''' while True: try: - return unicode(thisString) + return str(thisString) except UnicodeDecodeError as ex: #UnicodeDecodeError thisString = thisString[0:ex.start] + thisString[ex.end:] @@ -215,8 +224,8 @@ class Configuration(): #CAaddress = '' #CAport = '' - KAaddress = '' - KAport = '' + #KAaddress = '' + #KAport = '' MySQLhost = '' MySQLuser = '' @@ -243,18 +252,22 @@ class Configuration(): exit_now = False sf_file_path = '' stdout = False + githubUser = '' + githubToken = '' def __init__(self): - Config = ConfigParser.ConfigParser() - + #for python2.7 Config = ConfigParser.ConfigParser() + #for python3.x + Config = configparser.ConfigParser() Config.read(os.path.dirname(os.path.realpath(__file__)) + "/config") #file 'config' is in the same folder #Configuration.CAaddress = Config.get("CodeAnalyser", "ip_address") #Configuration.CAport = Config.get("CodeAnalyser", "port") - Configuration.KAaddress = Config.get("Knowledgebase","ip_address") - Configuration.KAport = Config.get("Knowledgebase","port") - + #Configuration.KAaddress = Config.get("Knowledgebase","ip_address") + #Configuration.KAport = Config.get("Knowledgebase","port") + Configuration.githubUser = Config.get("GitOAuth","user") + Configuration.githubToken = Config.get("GitOAuth","token") Configuration.MySQLhost = Config.get("Database", "MySQLhost") Configuration.MySQLuser = Config.get("Database", "MySQLuser") Configuration.MySQLpasswd = Config.get("Database", "MySQLpasswd") @@ -275,13 +288,11 @@ class Configuration(): #Configuration.sf_file_path = Config.get("General", "sf_file_path") # path to sourceforge files to be imported - #Configuration.flossmole_file_path = Config.get("General", "flossmole_file_path") # path to flossmole files to be imported #Ferbuary 2021 Now we have github_file_path Configuration.github_file_path = Config.get("General", "github_file_path") - Configuration.temporary_directory = Config.get("General", "temporary_directory") # path to system temporary directory @@ -292,8 +303,8 @@ class Configuration(): Configuration.log_path = Config.get("Logging", "path") Configuration.stdout = Config.getboolean("Logging", "stdout") Configuration.path_html = Config.get("Logging", "path_html") - # path to html files where monitoring information is written + # path to html files where monitoring information is written Configuration.apache_every_n_days = Config.getint("Fetchers", "apache_every_n_days") Configuration.github_every_n_days = Config.getint("Fetchers","github_every_n_days") #Configuration.codeplex_every_n_days = Config.getint("Fetchers", "codeplex_every_n_days") @@ -312,7 +323,7 @@ class Configuration(): Configuration.sf_updated_days = Config.getint("RepositoryCrawler", "sf_updated_days") #[KnowledgeBase] configuration: - Configuration.web_path_html = Config.get("Knowledgebase","web_path_html") + #Configuration.web_path_html = Config.get("Knowledgebase","web_path_html") class TrivialJSONEncoder(JSONEncoder): def default(self, o): @@ -368,7 +379,10 @@ class UrllibHelper(): ret = "" while not (success or n_attempts>max_attempts): try: - response = urllib2.urlopen(url_string) + #for python2,7 response = urllib2.urlopen(url_string) + #for python3.x: + response = urllib.encode(url_string) + ret = response.read() success = True except Exception as ex: diff --git a/web-crawler/apache_data_fetcher.py b/web-crawler/apache_data_fetcher.py index 
ad66dd5a5e58ce7989457af37d62ee4136c7a3be..766ffc93ad1364face5145cf60e6612cef3e5bf9 100755 --- a/web-crawler/apache_data_fetcher.py +++ b/web-crawler/apache_data_fetcher.py @@ -1,59 +1,396 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -# # Copyright 2014 Bitergium SLL -# Author: Davide Galletti +##Updated on July 2021 for MORPHEMIC project by Maria Antonietta Di Girolamo +# Ownership : Engineering Ingegneria Informatica S.p.A. +# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo +import urllib.request, urllib.error, urllib.parse +import xml.dom.minidom +import xml.etree.ElementTree as ET +import base64 +import elasticsearch +import requests +import traceback +import json, time, ast +import csv +import datetime +import subprocess +import glob +import datetime +import ast +from subprocess import Popen -from doapfiend.doaplib import load_graph -from generic_data_fetcher import GenericDataFetcher -import urllib2 from xml.dom import minidom +from generic_data_fetcher import GenericDataFetcher from CrawlerDatabase import CrawlerDatabase -import base64 -from Utils import Logger, Sources +from time import sleep +from Utils import Logger, Configuration, States, Sources, StringList, StringHelper, TrivialJSONEncoder, DateHelper +from xml import etree +from datetime import datetime +from elasticsearch import exceptions, Elasticsearch,helpers + +start_time = time.time() +# declare globals for the Elasticsearch client host +DOMAIN = "localhost" +PORT = 9200 +INDEX = "knowbase" +# concatenate a string for the client's host paramater +host = str(DOMAIN) + ":" + str(PORT) #+ "/values-knowdata" + +#es_client used to create the knowldgebase dopp-model +es_client = Elasticsearch([{'host':str(DOMAIN), 'port':str(PORT)}]) + class ApacheDataFetcher(GenericDataFetcher): def __init__(self): super(ApacheDataFetcher, self).__init__(Sources.Apache) + + if (es_client.ping()): + print("Yeah KnowledgeBase is up and running!") + else: + print("Ops it could not up and running!") + + + ''' + generator to push bulk data from a JSON + file into an Elasticsearch index + ''' + def bulk_json_data(self, knowledgebase, _index, _id): + + #json_list = knowledgebase + #for doc in json_list: + # use a `yield` generator so that the data + # isn't loaded into memory + # print doc + #if '{"index"' not in doc: + yield { + "_index": _index, + "_id": _id, + "_source": knowledgebase + } + + def run(self): logger = Logger.getInstance() - #reading the list of doap files - #Maria file.xml don't exist response = urllib2.urlopen('https://svn.apache.org/repos/asf/infrastructure/site-tools/trunk/projects/files.xml') - response = urllib2.urlopen('https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/data/projects.xml') - + data = urllib.request.urlopen("https://projects.apache.org/json/foundation/projects.json") + jsonData = data.read() + json_dictionary = json.loads(jsonData) + #print("Loading ..............") + #print("json dictionary is ") + #print(json_dictionary) - #print "after reading list of doap files " - xml = response.read() - xmldoc = xml.dom.minidom.parseString(xml) - itemlist = xmldoc.getElementsByTagName('location') - # reading - for s in itemlist : - print s.firstChild.data - try: - # getting the rdf-xml from the url - doapurl = urllib2.urlopen(s.firstChild.data) - xml = doapurl.read() - # parsing the rdf - doap = 
load_graph(xml) - # creating ApacheDoap also saves it to the RAW database - ApacheDoap(doap, xml, self.batch.id_batch) - logger.info("Read " + doap.name + " from " + s.firstChild.data) - except Exception as e: - logger.error('Error loading doap: ' + s.firstChild.data) - logger.error(str(e)) - + i = 0 + j = 0 + query_all = {'size':10000, 'query': {'match_all': {}}} + # get a response using the Search API + res = es_client.search(index="knowbase", body=query_all) + #res = es_client.search(index="knowbase", body={"query": {"match_all": {}}}) -class ApacheDoap(): - def __init__(self, doap, xml, idBatch): - #Maria da errore riformulo l'istruzione CrawlerDatabase.execute("INSERT into RAW_Apache_Project (name, homepage, doapfile, iddwbatch) VALUES ('" + doap.name + "', '" + str(doap.homepage) + "', '" + base64.b64encode(xml) + "', " + str(idBatch) + ")", True) -#'rdfSubject('')' - #le date nel file xml sono a volte caricate in modo incorretto o incompleto , per questo motivo - #decido al momento di inserire IGNORE nell'istruzione di inserimento - #questo pero dovra essere modificato e gestito - #MAria str(doap.homepage) riporta errore MySQL sostituisco con un replace della stringa - #CrawlerDatabase.execute("INSERT into RAW_Apache_Project (Name, Homepage, DoapFile, idDWBatch) VALUES ('" + doap.name + "', '"+ str(doap.homepage).replace("\'","\"") + "', '" + base64.b64encode(xml) + "', " + str(idBatch) + ")", True) - logger.info("qui no ") - CrawlerDatabase.execute("INSERT into RAW_Apache_Project (Name, Homepage, DoapFile, idDWBatch) VALUES ('" + doap.name + "', '"+ str(doap.homepage).replace("\'","\"") + "', '" + base64.b64encode(xml) + "', " + str(idBatch) + ")", True) - \ No newline at end of file + for hit in res['hits']['hits']: + print("hit if _id") + if (hit['_id'] == i): + #print((hit['_id'])) + j = i + 1 + else: + j = i + #print(("j ",j)) + i = j + #print(("now i is ",i)) + current_project_name = "" + try: + for key in json_dictionary: + #print("=====================") + #time.sleep(5) + label = [] + release = [] + programming_language = [] + category = [] + repositoryUrl = "" + latest = 0 + projectName = str(json_dictionary[key]['name']) + rdfFile = str(json_dictionary[key]['doap']) + try: + #print("prima di category") + category.append(str(json_dictionary[key]['category'])) + #try: + #print("json_dictionary of key is ",json_dictionary[key]) + #time.sleep(5) + if ('release' in json_dictionary[key]): + for r in json_dictionary[key]['release']: + if r["revision"] > str(latest): + latest = r["revision"] + name = json_dictionary[key]["name"] + created = r["created"] + release = {"revision": latest, + "name" : name, + "created" : created + } + #print(latest) + + try: + created = str(json_dictionary[key]['created']) + + except: + created = None + try: + repositoryUrl = json_dictionary[key]['repository'] + except: + repositoryUrl = None + try: + download_page = str(json_dictionary[key]['download-page']) + except: + download_page = "" + try: + homeUrl = str(json_dictionary[key]['homepage']) + except: + homeUrl = '' + try: + #programming_language.append(str(json_dictionary[key]['programming-language'])) + programming_language = str(json_dictionary[key]['programming-language']) + #programming_language = StringList().load_plain(programming_language).base64_encoded + #print("progra lang") + #print(programming_language) + + except: + programming_language = [] + try: + description=str(json_dictionary[key]['description']) + except: + description = "" + + try: + #print("dentro try di kb, release ") + 
#print(latest) + # StringList().load_plain(self.programming_language).base64_encoded, + parameters = { + 'project' : projectName, + 'description' : description.replace("\n", ""), + 'homepage' : homeUrl, + 'version' : str(latest), + 'downloadpage' : download_page.encode('utf-8'), + 'programminglanguage':programming_language, + 'rdfFile' : rdfFile, + 'repo':repositoryUrl + } + #print("PARAMETERS") + #print(str(parameters)) + knowledgebase = { + "projectName": projectName, + "description": description.replace("\n", "") + } + #print("KB") + #print(str(knowledgebase)) + sqlQueryString = "SELECT version FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(projectName) + sqlQueryString = sqlQueryString + "' AND version= '" + latest + sqlQueryString = sqlQueryString + "' AND homepage = '" + str(homeUrl) + sqlQueryString = sqlQueryString + "'" + logger.info(sqlQueryString) + rel = CrawlerDatabase.select_natural(sqlQueryString) + if rel !=None: + #if a new release exists then update else nothing to do + #time.sleep(5) + try: + #sqlQueryString = "SELECT updated FROM MetadataProject " + #sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(projectName) + #sqlQueryString = sqlQueryString + "' AND version= '" + latest + #sqlQueryString = sqlQueryString + "' AND homepage = '" + homeUrl + #sqlQueryString = sqlQueryString + "'" + #logger.info(sqlQueryString) + #updated = CrawlerDatabase.select_int(sqlQueryString) + #logger.info ("updated is " + str(updated)) + #if (updated == 1): + #description = description.replace("/"," ") + for repo in repositoryUrl: + repositoryUrl = repo + sqlUpdate = "UPDATE MetadataProject SET project_name = '"+ str(projectName) + "'," + if (len(description)>0): + sqlUpdate = sqlUpdate + " description = '"+ str(description.replace("',/","")) + "', " + sqlUpdate = sqlUpdate + " homepage = '" + str(json_dictionary[key]["homepage"]) + "', version='" + latest + "', " + sqlUpdate = sqlUpdate + " downloadpage = '" + str(download_page) + "', programming_language='" + str(programming_language) + sqlUpdate = sqlUpdate + "', updated = 1, typeRepo='Apache' " + sqlUpdate = sqlUpdate + " , repositoryUrl='" + str(repositoryUrl) + "'" + sqlUpdate = sqlUpdate + " WHERE project_name = '" + str(projectName) + "'" + #print(sqlUpdate) + #time.sleep(5) + #print("here send email with the messsage tht a project is updated with a new version") + + try: + CrawlerDatabase.execute(sqlUpdate,True) + logger.info("UPDATE SUCCESS ") + #self.elasitc_send_data() + #time.sleep(3) + sqlQueryString = "SELECT project_name,description, version,typeRepo,homepage, doapFile,repositoryUrl FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(json_dictionary[key]["name"]) + sqlQueryString = sqlQueryString + "' AND version= '" + latest + "'" + #sqlQueryString = sqlQueryString + "' AND homepage = '" + str(json_dictionary[key]['homepage']) + "' AND doapFile = '" + str(json_dictionary[key]['doap']) + "'" + #sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "'" + logger.info(sqlQueryString) + cursor = CrawlerDatabase.execute_cursor(sqlQueryString) + results = cursor.fetchall() + #print("stiamo per entrare qui") + for record in results: + print("record is ", str(record)) + + doc = { + "projectName": str(record[0]), + "description": str(record[1]), + "created": created, # format: 2006-03-27 + "downloadPage": str(download_page), + "homepage": str(record[4]), + "programmingLanguage": programming_language, + 
"release":{'url':"", + 'rel':{ + "revision":str(record[2]), + "name":str(record[3]), + "created":created + } + + }, + "serviceEndpoint": "", + "typeRepo":str(record[3]), + "topic":category, + "label": label, + "licenseUrl": str(json_dictionary[key]['license']), + "repositoryUrl" : str(record[6]) + } + #print(knowledgebase) + if not (es_client.indices.exists('knowbase')): + print("Index doesn't exist") + res = es_client.create(index='knowbase',body=doc,id=i) + ''' + else: + + print("Loading ..............") + # get a response using the Search API + #res = es_client.search(index="values-knowdata", body=query_all) + res = es_client.index(index='knowbase',body=knowledgebase,id=i) + i = i + 1 + print("res is ",res) + ''' + ''' + if not (es_client.indices.exists('knowbase')) : + es_client.indices.create(index='knowbase',body=knowledgebase,id=i) + ''' + else: + logger.info("Loading metadata information project into KnowledgeBase..............") + + # Check the number of documents in your index + c = es_client.count(index="knowbase") + #print(c['count']) + i = c['count'] + 1 + #print(es_client.count(index="knowbase")) + res = es_client.index(index='knowbase',body=knowledgebase,id=i) + #helpers.bulk(es_client, self.bulk_json_data(doc, "knowbase", i),refresh=True) + i = i + 1 + + except: + traceback.print_exc() + except Exception as ex: + traceback.print_exc() + else: + #logger.info("a questo punto inseriamo il nostro project e flag updated a 0 , i dati sono pronti per essere spediti") + sqlInsertMetadataProject = [] + sqlParamMetadataProject = [] + sqlStringMetadataProject = "" + #se il progetto non esiste allora fevo inserirlo e metto update a 0 + #qui fai insert coi dati nuovi e metti a 1 il flag updated + sqlInsertMetadataProject.insert(0,"INSERT INTO MetadataProject (project_name,") + sqlParamMetadataProject.insert(0,"VALUES (%(project)s,") + sqlInsertMetadataProject.extend(" homepage,version, downloadpage,updated, typeRepo,programming_language,repositoryUrl ) ") + sqlParamMetadataProject.extend(" %(homepage)s, %(version)s, %(downloadpage)s, 0, 'Apache',%(programminglanguage)s,%(repo)s )"); + #sqlInsertMetadataProject.extend(" VALUES ('" + str(json_dictionary[key]['homepage']) + "','" + str(json_dictionary[key]['name']) + "','" + str(description) + "','" + str(latest) + "','" + str(download_page)+ "', '0', 'Apache','" + str(programming_language) + "')") + sqlStringMetadataProject = ''.join (sqlInsertMetadataProject) +''.join (sqlParamMetadataProject) + #logger.info("I'M GOING TO INSERT NEW METADATA PROJECT INFORMATION") + #logger.info(sqlStringMetadataProject) + + try: + CrawlerDatabase.execute(sqlStringMetadataProject, True,parameters) + logger.info("INSERT SUCCESS") + #logger.info("DOPO elastic send data") + #read my metadataproject + sqlQueryString = "SELECT count(*) FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(json_dictionary[key]['name']) + sqlQueryString = sqlQueryString + "' AND version= '" + latest + sqlQueryString = sqlQueryString + "' AND homepage = '" + json_dictionary[key]['homepage'] + "'" + #logger.info("SELECT FROM MetadataProject") + logger.info(sqlQueryString) + #time.sleep(3) + if (CrawlerDatabase.select_int(sqlQueryString) > 0): + sqlQueryString = "SELECT project_name,description, version,typeRepo,homepage, doapFile,repositoryUrl FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(json_dictionary[key]["name"]) + sqlQueryString = sqlQueryString + "' AND version= '" + latest + sqlQueryString = 
sqlQueryString + "' AND homepage = '" + str(json_dictionary[key]['homepage']) + "' AND doapFile = '" + str(json_dictionary[key]['doap']) + "'" + #sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "'" + logger.info(sqlQueryString) + cursor = CrawlerDatabase.execute_cursor(sqlQueryString) + results = cursor.fetchall() + #print("stiamo per entrare qui") + for record in results: + #print("record is ", str(record)) + + doc = { + "projectName": str(record[0]), + "description": str(record[1]), + "created": created, # format: 2006-03-27 + "downloadPage": str(download_page), + "homepage": str(record[4]), + "programmingLanguage": programming_language, + "release":{'url':"", + 'rel':{ + "revision":str(record[2]), + "name":str(record[3]), + "created":created + } + + }, + "serviceEndpoint": "", + "typeRepo":str(record[3]), + "topic":category, + "label": label, + "licenseUrl": str(json_dictionary[key]['license']), + "repositoryUrl" : str(record[6]) + } + #print(i) + if not (es_client.indices.exists('knowbase')) : + res = es_client.indices.create(index='knowbase',body=doc,id=i) + + else: + logger.info("Loading metadata information project into KnowledgeBase..............") + + # Check the number of documents in your index + c = es_client.count(index="knowbase") + #print(c['count']) + i = c['count'] + 1 + #print(es_client.count(index="values-knowdata")) + res = es_client.index(index='knowbase',body=knowledgebase,id=i) + #helpers.bulk(es_client, self.bulk_json_data(doc, "knowbase", i),refresh=True) + i = i + 1 + #time.sleep(10) + + except: + logger.info("ERROR TO INSERT NEW METADATA PROJECT INFORMATION") + # printing stack trace + traceback.print_exc() + + else: + logger.info("No metadata Apache project in MetadaProject ") + pass + except: + traceback.print_exc() + else: + pass + else: + print("Version for this project is not provided . This project is discarded because is not valid for Moprhemic") + + except: + print("Topic for this project is not provided . this project is discarded") + traceback.print_exc() + + except Exception as ex: + traceback.print_exc() + #pass diff --git a/web-crawler/config b/web-crawler/config index 6e2efa777115823c2f206a15566a4c184d31e6c0..ba7bff5246ab35b5ae2a9b3ad95f1db3d76a8bea 100755 --- a/web-crawler/config +++ b/web-crawler/config @@ -1,31 +1,30 @@ [General] #The main thread waits sleep_timeseconds after each loop. -sleep_time=60 +sleep_time=30 #The notifier thread waits notifier_sleep_time seconds after eachloop. notifier_sleep_time=30 -#ENG marzo 2020 repository_crawler_sleep_time=30060 -repository_crawler_sleep_time = 60 +#ENG marzo 2020 +repository_crawler_sleep_time=30060 +#repository_crawler_sleep_time = 30 - - -#da scommentare prima di eseguire commit sf_file_path=/home/ubuntu/markos/markos02/data-for-doap-work github_file_path = /home/maria/markos/markos02/github temporary_directory=/home/maria/markos/markos02/tmp exit_now=False - +[GitOAuth] +user = xxxx +token = xxxxxxxx [Fetchers] - # a negative number (e.g. -1) disables the source # AAAA_every_n_days < 0 ==> AAAA data won't be crawled # For every forge there a configuration line which tells how many days #will the crawler wait before fetching data again from that forge; -#a negative number (e.g.-1) disables the source. -github_every_n_days=1 +#a negative number (e.g.-1) disa-bles the source. 
apache_every_n_days=-1 -jqueryplugin_every_n_days=-1 - +github_every_n_days=-1 +jqueryplugin_every_n_days=1 +rforge_every_n_days=-1 [RepositoryCrawler] # how many months of stats is the minimum I should crawl from githubarchive before integrating github projects @@ -51,12 +50,12 @@ path_html=/var/www/html/crawler/www path=/home/maria/markos/markos02/logcrawler.txt #Configurations of the KnowledgeBase -[Knowledgebase] -web_path_html = /var/www/html/knowledgebase +#[Knowledgebase] +#web_path_html = /var/www/html/knowledgebase # ip address and port of knowledgebase, format it like this: 192.168.178.39:8080 -ip_address=127.0.0.1 -port=6543 - +#ip_address=127.0.0.1 +#port=6543 +#port=9200 [Database] diff --git a/web-crawler/database/createCrawlerDB.sql b/web-crawler/database/createCrawlerDB.sql index ee12a26000e0ab883f9d1c7b09b914fa922df7a3..ceea640faa8d22ba2e175f7f01a82e699dd49bf0 100644 --- a/web-crawler/database/createCrawlerDB.sql +++ b/web-crawler/database/createCrawlerDB.sql @@ -63,22 +63,39 @@ CREATE TABLE IF NOT EXISTS `markosN2_db2`.`DWBatch` ( CONSTRAINT `idSource` FOREIGN KEY (`idSource` ) REFERENCES `markosN2_db2`.`Source` (`idSource` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION, + ON DELETE CASCADE + ON UPDATE CASCADE, CONSTRAINT `DWBatch_idFWState` FOREIGN KEY (`idWFState` ) REFERENCES `markosN2_db2`.`wfState` (`idwfState` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION, + ON DELETE CASCADE + ON UPDATE CASCADE, CONSTRAINT `FK_DWBatch_idParent` FOREIGN KEY (`idParent` ) REFERENCES `markosN2_db2`.`DWBatch` (`idDWBatch` ) - ON DELETE NO ACTION - ON UPDATE NO ACTION) + ON DELETE CASCADE + ON UPDATE CASCADE) ENGINE = InnoDB COMMENT = 'Batches of info about projects; is a wf entity'; +ALTER TABLE `markosN2_db2`.`DWBatch`( + ADD CONSTRAINT `idSource` + FOREIGN KEY (`idSource` ) + REFERENCES `markosN2_db2`.`Source` (`idSource` ) + ON DELETE CASCADE + ON UPDATE CASCADE, + ADD CONSTRAINT `DWBatch_idFWState` + FOREIGN KEY (`idWFState` ) + REFERENCES `markosN2_db2`.`wfState` (`idwfState` ) + ON DELETE CASCADE + ON UPDATE CASCADE, + ADD CONSTRAINT `FK_DWBatch_idParent` + FOREIGN KEY (`idParent` ) + REFERENCES `markosN2_db2`.`DWBatch` (`idDWBatch` ) + ON DELETE CASCADE + ON UPDATE CASCADE +) -- ----------------------------------------------------- -- Table `markosN2_db2`.`Project` -- ----------------------------------------------------- diff --git a/web-crawler/database/createMetadataProject.sql b/web-crawler/database/createMetadataProject.sql new file mode 100644 index 0000000000000000000000000000000000000000..890465a88f4ce3f5f155d64b1300319a98f36d85 --- /dev/null +++ b/web-crawler/database/createMetadataProject.sql @@ -0,0 +1,26 @@ + +CREATE TABLE IF NOT EXISTS `MetadataProject` ( + `idProject` INT NOT NULL AUTO_INCREMENT , + `project_name` VARCHAR(255) NULL , + `description` VARCHAR(4000) NULL , + `homepage` VARCHAR(255) NULL , + `created` DATE NULL , + `versionurl` VARCHAR(255) NULL , + `version` VARCHAR(255) NULL , + `downloadpage` VARCHAR(255) NULL , + `updated` BOOLEAN , + `developer_name` VARCHAR(40) NULL, + `programming_language` VARCHAR(2555) NULL , + `serviceEndpoint` TEXT NULL , + `label` VARCHAR(4000) NULL , + `typeRepo` TEXT NULL , + `topic` VARCHAR(4000) NULL , + `licenseUrl` TEXT NULL , + `repositoryUrl` TEXT NULL , + PRIMARY KEY (`idProject`) ) +ENGINE = InnoDB +DEFAULT CHARACTER SET = utf8 +COLLATE = utf8_swedish_ci +COMMENT = 'utf8_swedish_ci due to the fact that this collation is used in Flossmole table'; + + diff --git a/web-crawler/doap_project.py 
b/web-crawler/doap_project.py index df9a9b6f107b9229f340283adb71a1cded6f6e7f..508af6034f2aba1d92f6b4103692439cfb111315 100755 --- a/web-crawler/doap_project.py +++ b/web-crawler/doap_project.py @@ -4,15 +4,37 @@ # # Copyright 2014 Bitergium SLL +# Ownership : Engineering Ingegneria Informatica S.p.A. +# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo + from CrawlerDatabase import CrawlerDatabase from Utils import Logger, StringList, StringHelper, TrivialJSONEncoder, DateHelper import operator import sys from xml.dom import minidom -from _mysql import NULL -from asn1crypto._ffi import null +import traceback + +# import the Elasticsearch client library +from elasticsearch import exceptions, Elasticsearch + +# import JSON and time +import time +# create a timestamp using the time() method +start_time = time.time() + +# declare globals for the Elasticsearch client host +DOMAIN = "localhost" +PORT = 9200 +INDEX = "values-knowdata" +# concatenate a string for the client's host paramater +host = str(DOMAIN) + ":" + str(PORT) #+ "/values-knowdata" +#es_client used to create the knowldgebase dopp-model +es_client = Elasticsearch([{'host':str(DOMAIN), 'port':str(PORT)}]) + class FoafPerson: def __init__(self, firstName, lastName, login = ""): @@ -533,9 +555,67 @@ class DoapProject(object): self.translator.append(fp) elif idDoapRole == 6: self.helper.append(fp) - except Exception, e: + except Exception as e: logger.error(str(e)) + + def save_to_index(self): + logger = Logger.getInstance() + try: + # self.description.encode('utf-8') might give UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 118: character maps to + try: + description = self.description.encode('utf-8') + except: + description = self.description + try: + name = self.name.encode('utf-8') + except: + name = self.name + try: + download_page = self.download_page.encode('utf-8') + except: + download_page = self.download_page + + i = 0 + knowledgebase = { + 'name': name, + 'shortdesc': self.shortdesc.encode('utf-8'), + 'description': description, + 'homepage': self.homepage.encode('utf-8'), + 'created': self.created, # format: 2006-03-27 + 'mailing_list': self.mailing_list.encode('utf-8'), + 'download_page': download_page, + 'bug_database': self.bug_database.encode('utf-8'), + 'platform': self.platform.encode('utf-8'), + 'service_endpoint': self.service_endpoint.encode('utf-8'), + 'audience': self.audience.encode('utf-8'), + 'blog': self.blog.encode('utf-8'), + 'old_homepage': StringList().load_plain(self.old_homepage).base64_encoded, + 'category': self.category.encode('utf-8'),#StringList().load_plain(self.category).base64_encoded, + 'license': StringList().load_plain(self.license).base64_encoded, + 'download_mirror': StringList().load_plain(self.download_mirror).base64_encoded, + 'wiki': StringList().load_plain(self.wiki).base64_encoded, + 'programming_language': StringList().load_plain(self.programming_language).base64_encoded, + 'os': StringList().load_plain(self.os).base64_encoded, + 'language': StringList().load_plain(self.language).base64_encoded, + 'releaseUrl':StringList().load_plain(self.release).base64.base64_encoded, + 'topicUrl':"", + 'labelUrl':"", + 'topic':"", + 'label':"", + 'typoRepo':"Apache" + } + i = i + 1 + if not es_client.indices.exists('values-knowdata'): + result = es_client.create(index='values-knowdata',body=knowledgebase,id = i) + else: + result = es_client.index(index='values-knowdata',body=knowledgebase, id = i) + except Exception as ex: + 
logger.error(' 2 - saving to index: ' + self.name) + traceback.print_exc() + + + def save_to_db(self): logger = Logger.getInstance() try: @@ -710,7 +790,7 @@ class DoapProject(object): def modified(dp_old, dp_new, attributes): logger = Logger.getInstance() #if there are important differences flag the project as changed - for attribute in attributes.keys(): + for attribute in list(attributes.keys()): if (hasattr(dp_old, attribute) and hasattr(dp_new, attribute)): old = getattr(dp_old, attribute) new = getattr(dp_new, attribute) @@ -782,4 +862,4 @@ class DoapProject(object): self.description = row[2] self.home = row[3] self.created = row[4] - \ No newline at end of file + diff --git a/web-crawler/example.py b/web-crawler/example.py new file mode 100755 index 0000000000000000000000000000000000000000..1e3b740e0242ffb34d7e7b37f7afbe50fa73ae7c --- /dev/null +++ b/web-crawler/example.py @@ -0,0 +1,39 @@ +import requests +from elasticsearch import Elasticsearch +import json + +es = Elasticsearch([{'host':'localhost', 'port':9200}]) +res=requests.get('http://localhost:9200') +i=0 +p="pippo" + +while i<200: + #print "while" + #print p+str(i) + e={ + "first_name":"asd"+str(i), + "last_name":"pafdfd"+str(i), + "age": 27+i, + "about": "Love to play football"+str(i), + "interests": ['sports','music'], + } + #print i + + if not es.indices.exists('test'): + es.create(index='test', body=e,id=i) + else: + es.index(index='test', body=e,id=i) + #print(r.status_code) + #break + i=i+1 + + query_all = {'size':300, 'query': {'match_all': {}}} + print query_all + + res= es.search(index='test',body=query_all) + #print res + for hit in res['hits']['hits']: + print hit['_source'] + #print hit['_score'] + #print hit['_id'] + print '**********************' diff --git a/web-crawler/github_data_fetcher.py b/web-crawler/github_data_fetcher.py index 4877c5561cefabe44b2f5d926c5e897b9de6b07c..12f6628b4033ea577eb3483dbd5afa871348b4df 100755 --- a/web-crawler/github_data_fetcher.py +++ b/web-crawler/github_data_fetcher.py @@ -4,69 +4,98 @@ # # Copyright 2014 Bitergium SLL -#MDG March 2021 -#Owner Engineering Ingegneria Informatica S.p.A. -#Update the code for the MORPHEMIC release 1.5 +# Ownership : Engineering Ingegneria Informatica S.p.A. 
+# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo +#Updated on March 2021 the code for the MORPHEMIC release 1.5 + -from doapfiend.doaplib import load_graph from generic_data_fetcher import GenericDataFetcher from Utils import Configuration, Logger, Sources from datetime import date, timedelta from CrawlerDatabase import CrawlerDatabase from subprocess import Popen +import subprocess, json import glob from calendar import monthrange -import json import os import traceback import time from dw_batch import DwBatch import requests from requests.sessions import session +import ast +import xml.etree.ElementTree as ET +from elasticsearch import helpers, Elasticsearch +import elasticsearch +import logging + +# import the Elasticsearch client library +from elasticsearch import exceptions + +# import JSON and time +import json, time +# create a timestamp using the time() method +start_time = time.time() + +# declare globals for the Elasticsearch client host +DOMAIN = "localhost" +PORT = 9200 +INDEX = "knowbase" +# concatenate a string for the client's host paramater +host = str(DOMAIN) + ":" + str(PORT) #+ "/values-knowdata" + +# declare an instance of the Elasticsearch library +client = Elasticsearch(host) + +#es_client used to create the knowldgebase dopp-model +es_client = Elasticsearch([{'host':str(DOMAIN), 'port':str(PORT)}]) -from webob import Response, exc, response, request -from __builtin__ import False -session = requests.Session() class GitHubDataFetcher(GenericDataFetcher): def __init__(self): + super(GitHubDataFetcher, self).__init__(Sources.Github) + if (es_client.ping()): + print("Yeah KnowledgeBase is up and running!") + else: + print ("Ops it could not up and running!") + def run(self): - - months_of_stats = 0 - dt = date.today() - batch = self.batch.id_batch - #print "init dopo batch" - while months_of_stats < Configuration.github_archive_months: - #go to first day of month - dt1 = dt.replace(day=1) - #back one day so I get previous month - dt = dt1 - timedelta(days=1) - year_minus_cursor = dt.year - month_minus_cursor = dt.month - gas = GithubArchiveStats(year_minus_cursor, month_minus_cursor,batch) - #do I have data for this month - parameters = { - 'stat_year': year_minus_cursor, - 'stat_month': month_minus_cursor - } - if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: - months_of_stats = months_of_stats + 1 - elif gas.filesAvailable(): - gas.fetchFiles() - gas.processFiles() - #print "prima di send data" - #gas.send_data() - #print "dopo send data" - #gas.get_response() - gas.limitBatchLength() - months_of_stats = months_of_stats + 1 - - + ''' + ''' + logging.basicConfig(level=logging.ERROR) + try: + #r = requests.get('http://localhost:9200') + months_of_stats = 0 + dt = date.today() + while months_of_stats < Configuration.github_archive_months: + dt1 = dt.replace(day=1) #go to first day of month + dt = dt1 - timedelta(days=1) #back one day so I get previous month + year_minus_cursor = dt.year + month_minus_cursor = dt.month + gas = GithubArchiveStats(year_minus_cursor, month_minus_cursor) + #do I have data for this month + parameters = { + 'stat_year': year_minus_cursor, + 'stat_month': month_minus_cursor + } + if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: + months_of_stats = months_of_stats + 1 + #scommenta dopo che hai risolto if 
gas.filesAvailable(): + gas.fetchFiles() + gas.processFiles() + #indenta dopo che hai risolto + #gas.doap_model() + #gas.deleteJsonFile() + months_of_stats = months_of_stats + 1 + except Exception as ex: + print((str(ex))) class GithubArchiveStats(): @@ -74,226 +103,19 @@ class GithubArchiveStats(): We fetch statistics from http://www.githubarchive.org/ an instance is taking care of a specific month in a year """ - def __init__(self, y, m,batch): - - logger = Logger.getInstance() - #Maria commented on March 2021 - self.batch = batch - #logger.info("Batch is : " + str(self.batch)) - - ''' - if not (batch is None): - self.batch = DwBatch() - self.batch.id_batch = batch - logger.info("self.batch.id_batch: " + str(self.batch.id_batch)) - self.batch.id_source = 1 - # I create an array of IDs of old batches for comparison; one for each source - self.old_batch_ids = self.batch.latest_batches(False) - ''' + def __init__(self, y, m): self.y = y self.m = m + self.mm = "%02d" % m + #self.idDWBatch = self.batch.id_batch dt = date.today() + dt1 = dt.replace(day=1) #go to first day of month dt = dt1 - timedelta(days=31) #add 31 days so I go to next month - #dt = dt1 - timedelta(days=31) #add 31 days so I go to next month self.mm_next_month = "%02d" % dt.month self.yyyy_next_month = "%02d" % dt.year - self.url = "http://127.0.0.1:6543/projects?update=0" - - - def htmlMonitor(self, html_text): - print fetcher_file - fetcher_file = open(Configuration.path_html + '/crawler/fetcher.html',"w") - print "dentro htmlMonitor" - os.chmod(Configuration.web_path_html + '/projects.json', 0o777) - fetcher_file.write(html_text) - fetcher_file.close() - - - ''' - def writeJson(self,a): - json_file = Configuration.path_html + '/projects.json' - fetcher_file = open(Configuration.path_html + '/crawler/fetcher.html',"w") - if os.path.isfile(json_file): - os.remove(json_file) - #print("File Removed!Now I'm going to create it") - fetcher_file = open(json_file,"w") - os.chmod(Configuration.path_html + '/projects.json', 0o777) - else: - fetcher_file = open(json_file,"w") - os.chmod(Configuration.path_html + '/projects.json', 0o777) - with fetcher_file as out_file: - json.dump(a, out_file,indent=2) - #json.dumps(a,out_file, indent=4) - - ''' - - #print('write file json ') - - - def send_data(self, updated): - #possiamo dopo inserimento in metadataproject spedire al - #knowledgebase: provare .... - if request.remote_addr != Configuration.KAaddress: - raise _401() - - try: - CrawlerDatabase.connect() - - except mysql.connector.Error as err: - print("Something went wrong: {}".format(err)) - - - - print "estraggo i miei metadata di idProject and updated=0" - - try: - cursor = CrawlerDatabase.execute_cursor("SELECT idProject,project, description, versionUrl,version, deploymentUrl, url, idDWBatch,updated FROM MetadataProject where updated='"+str(updated)+"'") - results = cursor.fetchall() - print results - if (results == ()): - print "There is no metadataproject to send. Please, wait....." 
- #Attenzione : questa operazione non deve essere simulata la risposta REST - #Spezzare il metodo in due parti restituire True o False o array vuoto - #se non ci sono i metadati allora il codice continua - #altrimenti gestisce con il rest - #return - else: - for record in results: - metadata = { - 'idProject' : record[0], - 'name' : record[1], - 'description' : record[2], - 'versionUrl' : record[3], - 'version' : record[4], - 'deploymentUrl' : record[5], - 'url' : record[6], - 'idDWBatch' : record[7], - 'updated' : record[8] - } - _VALUES['value'].append(metadata) - - l = len (_VALUES['metadata']) - - #soglia_max = 100 - soglia_max = 5 - num = l/soglia_max - resto = l%soglia_max - page = [[]] - i = 0 - num = num+1; - int_max =0; - while (i Configuration.max_batch_size: - logger.info("limitBatchLength Batch " + str(self.batch.id_batch) + " has " + str(batch_length) + " projects. I will be split to reduce its size.") - while batch_length > Configuration.max_batch_size: - # I load the current batch - current_batch = DwBatch() - current_batch.load(self.batch.id_batch) - # I create a new batch with same info as this one - child_batch = current_batch - # new one is a child of current batch - child_batch.id_parent = self.batch.id_batch - child_batch.id_batch = 0 - # insert to DB - child_batch.initialize(child_batch.id_source) - # write to db all fields - child_batch.writeToDB() - # now I have the new child_batch.id_batch - # see here for explanation of following query: http://stackoverflow.com/questions/1513206/update-multiple-rows-using-limit-in-mysql - #Logger.debug("Batch " + str(child_batch.id_batch) + " created as a split of " + str(self.batch.id_batch)) - logger.info("Batch " + str(child_batch.id_batch) + " created as a split of " + str(self.batch.id_batch)) - parUpdate = { - 'idDWBatchOLD': self.batch.id_batch, - 'idDWBatch': child_batch.id_batch, - 'max_batch_size' : Configuration.max_batch_size - } - sqlUpdate = "UPDATE DWBatch SET idDWBatch=%(idDWBatch)s WHERE " - sqlUpdate += " idDWBatch=%(idDWBatchOLD)s LIMIT %(max_batch_size)s) tmp)" - #Logger.debug("Batch " + str(child_batch.id_batch) + " created as a split of " + str(self.batch.id_batch)) - logger.info("Batch " + str(child_batch.id_batch) + " created as a split of " + str(self.batch.id_batch)) - CrawlerDatabase.execute(sqlUpdate, True, parUpdate) - # let's check again if there are too many projects - batch_length = CrawlerDatabase.select_int(sql, parameters) - + self.html_monitor_text = "Github
" @staticmethod def statsAvailable(): @@ -321,7 +143,7 @@ class GithubArchiveStats(): if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: how_many = how_many + 1 months_of_stats = months_of_stats + 1 - logger.debug("GithubArchiveStats.statsAvailable: Do we have any stats to process?" + str(how_many >= Configuration.github_archive_months)) ## DEBUG + #logger.debug("GithubArchiveStats.statsAvailable: Do we have any stats to process?" + str(how_many >= Configuration.github_archive_months)) ## DEBUG return how_many >= Configuration.github_archive_months @@ -341,329 +163,516 @@ class GithubArchiveStats(): Files are per hour with name: YEAR-MONTH-DAY-HOUR.json.gz """ logger = Logger.getInstance() - self.html_monitor_text = "Github Integrator
" - #Pavia: for day_iter in range(1, monthrange(self.y, self.m)[1] + 1): #number of days in this month for day_iter in range(1, 3): #number of days in this month - #for day_iter in range(1, 2): #number of days in this month - #Maria marzo 2021for day_iter in range(1, 4): - for hour_iter in range(24): + #Pavia: for hour_iter in range(24): - #for hour_iter in range(10, 12): - #Mariafor hour_iter in range(10, 11): + for hour_iter in range(10, 12): sz_day = "%02d" % day_iter sz_hour = str(hour_iter) - + if not os.path.isfile(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz"): proc = Popen("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz", shell=True, cwd=Configuration.github_file_path + "/gh") return_code = proc.wait() if return_code == 8: logger.error("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz" + " returned error code 8") + + def processFiles(self): logger = Logger.getInstance() project_name = "" release_id = "" - #logger.info("=== MA siamo dentro processFiles=====") + #print("ProcessFile") + #print Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json.gz" compressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json.gz") for compressed_file in compressed_files: - proc = Popen("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh") + proc = Popen("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh") return_code = proc.wait() - + if return_code == 8 : + logger.error("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh") + #uncompressed_files = glob.glob(Configuration.flossmole_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json") + + #print Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json" + #print ("===============================") uncompressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json") - #logger.info(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json") + #print("uncrompressed file") + #print uncompressed_files for uncompressed_file in uncompressed_files: + #print uncompressed_file + #if not(os.path.isfile(uncompressed_file)): with open(uncompressed_file) as f: - content = f.readlines() - for line in content: - try: - decoded = json.loads(line) - # GistEvent lines have no repository - if decoded["type"] != "GistEvent" : #not interested in Gists - #To speed up testing restrict to ReleaseEvent - #if decoded["type"] == "ReleaseEvent": - repo = decoded["repo"] - actor = decoded["actor"] - logger.debug("Parsing event type: " + decoded["type"] + " from project: " + repo["name"]) + content = f.readlines() + #print "CONTENT" + #print content + for line in content: try: - if decoded["type"] == "RepositoryEvent" and ( decoded["action"] == "created" or decoded["action"] == "edited" or decoded["action"] == "renamed" ): + decoded = json.loads(line) + #print "==== DECODED=====" + #print decoded + # GistEvent lines have no repository + if decoded["type"] != "GistEvent" : #not interested in Gists + #To speed up testing restrict to ReleaseEvent + #if decoded["type"] == "ReleaseEvent": + repo = decoded["repo"] + actor = decoded["actor"] try: - project_description = decoded["description"] - logger.debug("Found description:" + 
project_description + " for project: " + repo["name"]) - except: - project_description = "" - else: - project_description = "" - - #print("DEBUG!!") - #time.sleep(1.5) - ''' - if decoded["type"] == "PullRequestEvent" : - payload = decoded["payload"] - pull_request = payload["pull_request"] - deployments_url = pull_request["deployments_url"] - license = pull_request["license"] - language = pull_request["language"] - logger.debug("deploy " + deployment_url + " license " + license + " language " + language) - ''' - #Pavia: in questo pezzo di codice incrementa gli eventi relativi ad un progetto gia' conosciuto per il periodo preso in considerazione nelle statistiche - #se in precedenza abbiamo trovato una descrizione del progetto aggiorna il relativo campo - #print self.y - #print self.m - #print "BATCH in GitHUBINTEGRATOR ha valore " - #print self.batch - #print "BATCH in GitHUBINTEGRATOR" - #logger.info("At the moment developer is " + str(actor["login"]) + "for project" + repo["name"]) - parameters = { - 'project_name': repo["name"], - 'description': project_description, - 'stat_year': self.y, - 'stat_month': self.m, - 'idDWBatch' : str(self.batch) + if decoded["type"] == "RepositoryEvent" and ( decoded["action"] == "created" or decoded["action"] == "edited" or decoded["action"] == "renamed" ): + try: + project_description = decoded["description"] + #logger.debug("Found description:" + project_description + " for project: " + repo["name"]) + except: + project_description = "" + else: + project_description = "" + + parameters = { + 'project_name': repo["name"], + 'description': project_description, + 'stat_year': self.y, + 'stat_month': self.m + } + if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", parameters) > 0: - } - #print (str(parameters)) - if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", parameters) > 0: - - sqlString = "select stat_year from rc_gh_archive where project_name LIKE %(project_name)s" - - #if (CrawlerDatabase.select_int(sqlString,parameters) != self.y): - if parameters['description'] == "": #if description is empty I do not overwrite it as it might have been there in other events - CrawlerDatabase.execute("UPDATE rc_gh_archive SET event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", True, parameters) - #print "esco dall'if di update rc_gh_archive" - else: - CrawlerDatabase.execute("UPDATE rc_gh_archive SET description=%(description)s, event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", True, parameters) - #print "esco else di update rc_gh_archive" - #else: - #Maria February 2021 cancello il progetto associandogli l'anno piu recente - #print "boh forse non si deve fare in questo modo?" 
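The check-then-UPDATE/INSERT flow on rc_gh_archive above can also be collapsed into one statement, along the lines of the ON DUPLICATE KEY variants left commented out; a sketch that assumes a unique key over (project_name, stat_year, stat_month) and keeps the rule of never overwriting a non-empty description with an empty one:

# Illustrative values; the real ones come from the decoded GitHub Archive event.
parameters = {
    'project_name': 'octocat/Hello-World',
    'description': '',
    'stat_year': 2021,
    'stat_month': 2,
}
sql = ("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) "
       "VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s) "
       "ON DUPLICATE KEY UPDATE "
       "event_count = event_count + 1, "
       "description = IF(VALUES(description) = '', description, VALUES(description))")
CrawlerDatabase.execute(sql, True, parameters)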
- # CrawlerDatabase.execute("DELETE FROM rc_gh_archive where stat_year = %(stat_year)s and project_name = %(project_name)s",False, parameters) - # CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month, idDWBatch) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s, %(idDWBatch)s) ON DUPLICATE KEY UPDATE description = %(description)s, event_count=1 ", False, parameters) - - else: - try: - #CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month, idDWBatch) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s, %(idDWBatch)s) ON DUPLICATE KEY UPDATE description = %(description)s, event_count=1 ", False, parameters) - CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month, idDWBatch) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s, %(idDWBatch)s)", True, parameters) - except: - traceback.print_exc() - # except Exception, ex: - #logger.error("Error insert rc_gh_archive : " + str(ex)) - #Pavia: se l'evento e' di tipo ReleaseEvent, qui parsiamo le informazioni necessarie per popolare la rc_gh_archive_release - if decoded["type"] == "ReleaseEvent": - #Pavia: l'"url" e l'"id" della release ora li troviamo sotto payload->release - payload = decoded["payload"] - release = payload["release"] - id = release["id"] - #developer_name': repo["name"].rsplit("/",1)[0], - #MDG : add to verify that the project with specific version doesn't exist yet into DB - parameters = { - 'project_name': repo["name"], - 'stat_year': self.y, - 'stat_month': self.m, - 'url': str(release["url"]), - 'version': str(release["id"]), - } - logger.info("Found release event for project: " + repo["name"] + ", release id: " + str(release["id"]) + ", release url: " + release["url"]) - - try: - #search if project with specific version yet exist in the DB - sql_query = "SELECT rel.project_name " - sql_query = sql_query + " FROM rc_gh_archive_release rel LEFT JOIN" - sql_query = sql_query + "(select project_name, idDWBatch FROM rc_gh_archive GROUP BY project_name)" - sql_query = sql_query + "as arc on rel.project_name=arc.project_name ORDER BY rel.project_name" - #print ("Search if project with specific version exist into the rc_gh_archive join with rc_gh_archive_release : ", sql_query) - project_name = CrawlerDatabase.select_natural(sql_query) - logger.info("project_name " + str(project_name)) - sqlString = "INSERT INTO rc_gh_archive_release (project_name, releaseUrl, version) VALUES (%(project_name)s, %(url)s, %(version)s)" - #sqlString = " ON DUPLICATE KEY UPDATE version=%(version)s, releaseUrl=%(url)s, developer_name=%(developer_name)s" - #logger.info("INSERT into rc_gh_archive_release is " + str(sqlString)) - #logger.info(str(parameters)) - #if the project with the specific version is not loaded then insert else no action is provided - if (project_name != repo["name"]): - logger.info("INSERT INTO rc_gh_archive") - #CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, releaseUrl, version, developer_name) VALUES (%(project_name)s, %(url)s, %(version)s, %(developer_name)s) ON DUPLICATE KEY UPDATE version=%(version)s, releaseUrl=%(url)s, developer_name=%(developer_name)s ", True, parameters) - CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, releaseUrl, version) VALUES (%(project_name)s, %(url)s, %(version)s)", True, parameters) - - except: - 
traceback.print_exc() - #here start the integration of the project information stored into rc_gh_archive and rc_gh_archive_release - logger.info("Now store metadata project") - start = time.time() - try: - logger.info("Starting Metadata Project Integration") - #Load the whole batch; for flossmole gh_projects table is dropped and created every time it is downloaded so we do not filter by idDWBatch - #all fields in gh_projects: datasource_id, description, developer_name, fork_number, forked, homepage, last_modified, open_issues, private, project_name, url, watchers, XML - parameters = { 'how_many': Configuration.github_top_projects_event_count } - sql_query = "SELECT rel.project_name, arc.description, rel.releaseUrl, rel.version,arc.idDWBatch " - sql_query = sql_query + " FROM rc_gh_archive_release rel LEFT JOIN" - sql_query = sql_query + " (select project_name, description,idDWBatch FROM rc_gh_archive GROUP BY project_name)" - sql_query = sql_query + " as arc on rel.project_name=arc.project_name ORDER BY rel.project_name" + sqlString = "select stat_year from rc_gh_archive where project_name LIKE %(project_name)s" + + if parameters['description'] == "": #if description is empty I do not overwrite it as it might have been there in other events + CrawlerDatabase.execute("UPDATE rc_gh_archive SET event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", True, parameters) + else: + CrawlerDatabase.execute("UPDATE rc_gh_archive SET description=%(description)s, event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", True, parameters) + + else: + try: + CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s)", True, parameters) + except: + traceback.print_exc() - logger.info("prima di cursor select rc_gh_archive_release") - - cursor = CrawlerDatabase.execute_cursor(sql_query,parameters) - #logger.info("cursor esegui cursor sqlquery parameters") - results = cursor.fetchall() - #for each project in the batch, sorted by project_name, there might be more than one row per project if there are more releases - current_project_name = "" - for record in results: - #id_batch = self.batch.id_batch - if record[4] != None: - revision = record[3] - release_url = record[2] - release_url = release_url.replace('/releases/tag/', '/archive/') - release_url = release_url.replace('/releases/', '/archive/') - # add .zip - release_url = release_url + ".zip" - #download_page = release_url + ".zip" - deployment_url = "https://api.github.com/repos/" + record[0] + "/deployments" - #repo url - url = "https://api.github.com/repos/" + record[0] - release = record[3].encode('utf-8') - project = record[0] - #developer = record[0] - - idDWBatch = str(record[4]) - #proviamo a creare il campo language : - #example: https://api.github.com/repos/jgtate/Hackathon2018 - #get repo json information : - ''' - r = requests.get(url) - decoded = json.loads(r.content) - language = decoded["language"] - ''' - ''' - languageUrl = decoded["languages_url"] - rl = requests.get(languageUrl) - tot_languages = encoded(rl.content) - ''' - - - parameters = { - 'project' : project, - 'description' : str(record[1]), - 'url' : url, - 'versionUrl' : release_url, - 'version' : release, - 'deploymentUrl' : deployment_url, - 'idDWBatch' : idDWBatch + if decoded["type"] == "ReleaseEvent": + payload = 
decoded["payload"] + release = payload["release"] + id = release["id"] + parameters = { + 'project_name': repo["name"], + 'stat_year': self.y, + 'stat_month': self.m, + 'url': str(release["url"]), + 'version': str(release["id"]), + 'developer':str(actor["login"]) + } + #search if project with specific version yet exist in the DB + sql_query = "SELECT rel.project_name FROM rc_gh_archive_release as rel, rc_gh_archive as arc " + sql_query = sql_query + " WHERE arc.project_name = '" + str(repo["name"]) + "' and rel.project_name=arc.project_name;" + #print sql_query + project_name = CrawlerDatabase.select_natural(sql_query) + #print("project name ", project_name) + #sqlString = "INSERT INTO rc_gh_archive_release (project_name, url, version) VALUES (%(project_name)s, %(url)s, %(version)s)" + #print sql_query + #qui dovrei evitare di inserire progetti gia esistenti per git + #se il progetto non esiste lo inserisco ma se esiste guardo la release + #IF the project is not stored INSERT into Crawler DB + #time.sleep(10) + if (project_name == None): + try: + CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version,developer_name) VALUES (%(project_name)s, %(url)s, %(version)s,%(developer)s)", True, parameters) + except: + #print "A" + traceback.print_exc() + else: + #ELSE if the release is not stored into Crawler DB update the project name row with the new release + sql_query = "SELECT count(*) FROM rc_gh_archive_release where project_name ='" +str(repo["name"]) + "' and version = '" +str(release["id"]) +"' ORDER BY project_name" + # print sql_query + num = CrawlerDatabase.select_int(sql_query) + #se la release non esiste ma esiste il progetto aggirono la release + #altrimenti non faccio nulla + #print("count is ", str(num)) + if (CrawlerDatabase.select_int(sql_query) == 0): + try: + #CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version,developer_name) VALUES (%(project_name)s, %(url)s, %(version)s,%(developer)s)", True, parameters) + + CrawlerDatabase.execute("UPDATE rc_gh_archive_release set version = '" + str(release["id"]) +"' where project_name = '" + str(repo["name"]) + "'",True) + except: + traceback.print_exc() + #print("Now create into kb the project fetched") + self.doap_model() + #time.sleep(5) + + except Exception as ex: + #print("first exception") + traceback.print_exc + except Exception as ex: + #print "E" + #logger.info(str(ex) + " 2 missing in " + str(line)) + traceback.print_exc + + +#START CREATION DOAP MODEL for GIT PROJECTS + #def create_index(self,index_name,know_body): + def create_index(self,index_name): + # index settings + settings = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0 + } + } + #"content":know_body - } + #Ignore 400 means to ignore "Index Already Exist" error. 
+ #if not es_object.indices.exists(index_name): + es_client.index(index=index_name,body=settings) + #print('Created Index ', index_name) + return index_name + + #dismissed at the moment + def format_results(self,results): + + """Print results nicely: + doc_id) content + """ + data = [doc for doc in results['hits']['hits']] + return data + + def get_all_docs(self): + try: + # use the JSON library's dump() method for indentation + info = json.dumps(es_client.info(), indent=4) + # pass client object to info() method + #print ("Elasticsearch client info():", info) + except exceptions.ConnectionError as err: + print(("\nElasticsearch info() ERROR:", err)) + print(("\nThe client host:", host, "is invalid or cluster is not running")) + # change the client's value to 'None' if ConnectionError + #es_client = None + # valid client instance for Elasticsearch + if es_client != None: + # get all of the indices on the Elasticsearch cluster + all_indices = es_client.indices.get_alias("*") + # keep track of the number of the documents returned + doc_count = 0 + # iterate over the list of Elasticsearch indices + for num, index in enumerate(all_indices): + # declare a filter query dict object + match_all = { + #"size": 100, + "query": { + "match_all": {} + } + } + # make a search() request to get all docs in the index + resp = es_client.search( + index = index, + body = match_all, + scroll = '2s' # length of time to keep search context + ) + # keep track of pass scroll _id + old_scroll_id = resp['_scroll_id'] + # use a 'while' iterator to loop over document 'hits' + while len(resp['hits']['hits']): + # make a request using the Scroll API + resp = es_client.scroll( + scroll_id = old_scroll_id, + scroll = '2s' # length of time to keep search context + ) + # check if there's a new scroll ID + if old_scroll_id != resp['_scroll_id']: + print(("NEW SCROLL ID:", resp['_scroll_id'])) + # keep track of pass scroll _id + old_scroll_id = resp['_scroll_id'] + listOfAll = [] + # iterate over the document hits for each 'scroll' + for doc in resp['hits']['hits']: + listOfAll.append(doc['_source']) + doc_count += 1 + + print(("\nTOTAL DOC COUNT:", doc_count)) + print(("TOTAL TIME TO GET ALL DOCS:", time.time() - start_time, "seconds.")) + return listOfAll + + def searchQuery(self): + res= es_client.search(index='knowledgebase',doc_type='_doc',body={ + "query": { + "match_all": {} + } + #'query':{ + # 'match':{ + # "content":"kubernetes, docker,FPGA,serverless" + # }, + # 'match_phrase':{ + # "about":"kubernetes, docker, FPGA, serverless" + # } + # } + }) + + for hit in res['hits']['hits']: + print((hit['_index'])) + print((hit['_id'])) + print((hit['_type'])) + print((hit['_score'])) + print((hit['_source'])) + print((res['hits']["total"])) + return res + + def checkFileExistance(self, filePath): + try: + with open(filePath,'r') as f: + return True + except FileNotFoundError as e: + return False + except IOError as e: + return False + + ''' + generator to push bulk data from a JSON + file into an Elasticsearch index + ''' + def bulk_json_data(self, knowledgebase, _index, _id): + + #json_list = knowledgebase + #for doc in json_list: + # use a `yield` generator so that the data + # isn't loaded into memory + # print doc + #if '{"index"' not in doc: + yield { + "_index": _index, + "_id": _id, + "_source": knowledgebase + } + + def doap_model(self): + logger = Logger.getInstance() + try: + logger.info("Loading metadataproject into KnowledgeBase......") + #Load the whole batch; for flossmole gh_projects table is dropped and 
created every time it is downloaded so we do not filter by idDWBatch + #all fields in gh_projects: datasource_id, description, developer_name, fork_number, forked, homepage, last_modified, open_issues, private, project_name, url, watchers, XML + parameters = { 'how_many': Configuration.github_top_projects_event_count } + if not es_client.indices.exists('knowbase'): + self.create_index('knowbase') + + sql_query = "SELECT rel.project_name, arc.description, rel.url, rel.version " + sql_query = sql_query + " FROM rc_gh_archive_release rel LEFT JOIN" + sql_query = sql_query + " (select project_name, description FROM rc_gh_archive GROUP BY project_name)" + sql_query = sql_query + " as arc on rel.project_name=arc.project_name ORDER BY rel.project_name" + cursor = CrawlerDatabase.execute_cursor(sql_query,parameters) + results = cursor.fetchall() + #print("Loading ..............") + i = 0 + j = 0 + query_all = {'size':10000, 'query': {'match_all': {}}} + # get a response using the Search API + res = es_client.search(index="knowbase", body=query_all) + #res = es_client.search(index="knowbase", body={"query": {"match_all": {}}}) + + for hit in res['hits']['hits']: + print("hit if _id") + if (hit['_id'] == i): + #print((hit['_id'])) + j = i + 1 + else: + j = i + #print(("j ",j)) + i = j + #print(("now i is ",i)) + current_project_name = "" + + for record in results: + if record[0] != None: + try: + topic = [] + + programminglanguage = [] + name = str(record[0]) + name_index= name.find("/") + fullname = name[name_index+1:len(name)] + downloadPage = "https://api.github.com/repos/"+record[0]+ "/downloads" + homePage = "https://api.github.com/repos/" + record[0] + programminglanguage.append("https://api.github.com/repos/"+record[0]+ "/languages") + serviceEndPoint = "https://api.github.com/search/repositories?q=" +str(fullname) + licenseUrl = "https://api.github.com/repos/" + record[0] +"/license" + #data = json.loads(subprocess.check_output("curl -H 'Accept: application/vnd.github.mercy-preview+json' https://api.github.com/repos/" + str(record[0]) + "/topics >> " + Configuration.github_file_path + "/topics/" + str(fullname)+".json", shell=True, cwd=Configuration.github_file_path + "/topics")) + try: + #logger.info("Starting Doap Model Integration") + parameters = { 'how_many': Configuration.github_top_projects_event_count } + sql_query = "SELECT rel.project_name, arc.description, rel.url, rel.version " + sql_query = sql_query + " FROM rc_gh_archive_release rel LEFT JOIN" + sql_query = sql_query + " (select project_name, description FROM rc_gh_archive GROUP BY project_name)" + sql_query = sql_query + " as arc on rel.project_name=arc.project_name ORDER BY rel.project_name" + #print(sql_query) + #time.sleep(5) - #Beofre to insert the new data in MetadataProject , a comparison with the existing data in MetadataProject has been done. 
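The MetadataProject lookups built here by string concatenation can be expressed with the same %(name)s placeholders that CrawlerDatabase.execute() and select_int() already accept elsewhere in this file, which also sidesteps quoting problems in project names and descriptions; a sketch using the variables already bound in the record loop:

parameters = {
    'project': project,
    'version': revision,
    'versionUrl': release_url,
    'homepage': url,
    'deploymentUrl': deployment_url,
}
sql = ("SELECT COUNT(*) FROM MetadataProject "
       "WHERE project_name = %(project)s AND version = %(version)s "
       "AND versionUrl = %(versionUrl)s AND homepage = %(homepage)s "
       "AND deploymentUrl = %(deploymentUrl)s")
# select_int(sql, parameters) follows the same calling convention used above.
if CrawlerDatabase.select_int(sql, parameters) == 0:
    pass  # project/version pair not seen yet; fall through to the INSERT branch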
- #The following scenario has been identified - #1) if the new data exist and not updated -> no action - #2) if the new data are in a new version compared to old data : update a old data and the flag updated change from 0 to 1 - #3) if the new data don't exist in MEtadataProject -> insert into table and change the flag update from 0 t0 1 - #sqlQueryString = "SELECT project,developer, description, url,versionUrl,version,deploymentUrl FROM MetadataProject " - + cursor = CrawlerDatabase.execute_cursor(sql_query,parameters) + results = cursor.fetchall() + current_project_name = "" + for record in results: + if record[0] != None: + revision = str(record[3]) + release_url = str(record[2]) + deployment_url = "https://api.github.com/repos/" + record[0] + "/deployments" + url = "https://api.github.com/repos/" + record[0] + project = record[0] + programming_language = "https://api.github.com/repos/"+record[0]+ "/languages" + #idDWBatch = str(record[4]) + parameters = { + 'project' : project, + 'description' : str(record[1]), + 'url' : url, + 'versionUrl' : release_url, + 'version' : revision, + 'deploymentUrl' : deployment_url + } + #print "dopo parameter e rpima di parameters" + #time.sleep(5) + try: + + sqlQueryString = "SELECT version FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(project) + sqlQueryString = sqlQueryString + "' AND version= '" + str(revision) + sqlQueryString = sqlQueryString + "' AND versionUrl = '" + release_url + sqlQueryString = sqlQueryString + "' AND homepage = '" + str(url) + sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "'" + #logger.info(sqlQueryString) + release = CrawlerDatabase.select_natural(sqlQueryString) + if release !=None: + print("release ",release, " is ") + #if a new release exists then update else nothing to do + if release > str(revision): + print(" > ", str(revision)) try: - sqlQueryString = "SELECT version FROM MetadataProject " - sqlQueryString = sqlQueryString + " WHERE project= '" + str(project) - sqlQueryString = sqlQueryString + "' AND version= '" + str(release) - sqlQueryString = sqlQueryString + "' AND versionUrl = '" + release_url - sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "' AND url = '" + str(url) + "'" + #sqlQueryString = "SELECT updated FROM MetadataProject " + #sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(project) + #sqlQueryString = sqlQueryString + "' AND version= '" + str(revision) + #sqlQueryString = sqlQueryString + "' AND versionUrl = '" + release_url + #sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "' AND homepage = '" + str(url) + "'" #logger.info(sqlQueryString) - release = CrawlerDatabase.select_natural(sqlQueryString) - if release !=None: - logger.info ("if release exist for the project " + str(project)) - logger.info("release esiste -> progetto esiste -> guardo il flag update ") - try: - sqlQueryString = "SELECT updated FROM MetadataProject " - sqlQueryString = sqlQueryString + " WHERE project= '" + str(project) - sqlQueryString = sqlQueryString + "' AND version= '" + str(release) - sqlQueryString = sqlQueryString + "' AND versionUrl = '" + release_url - sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "' AND url = '" + str(url) + "'" - logger.info(sqlQueryString) - updated = CrawlerDatabase.select_int(sqlQueryString) - logger.info ("updated is " + str(updated)) - ''' - BLOCCO PER IL CONTROLLO METADATAPROJECT spedito o no al 
KNOWLEDGEBASE - ''' - if (updated == 1): - logger.info("allora esiste il progetto ,se flag a 1 i dati sono stati gia spedit e metto flag a 0 " + str(updated) ) - #se updated uguale a zero ed esiste il progetto aggiorno tutto e update=1 pronto per essere spedito: - #if (updated>0): - sqlUpdate = "UPDATE MetadataProject SET project = '"+ str(project)+ "'," - if (len(record[1])>0): - sqlUpdate = sqlUpdate + " description = '"+ str(record[1]) + "', " - sqlUpdate = sqlUpdate + " url = '" + str(url) + "', versionUrl='" +str(release_url) + "', version='" + str(release) + "', deploymentUrl='"+str(deployment_url)+"', idDWBatch='"+str(idDWBatch)+"'" - sqlUpdate = sqlUpdate + ", updated = 0 WHERE project = '" +str(project) + "'" - logger.info(sqlUpdate) - try: - CrawlerDatabase.execute(sqlUpdate,True) - logger.info("UPDATE SUCCESS ") - except: - traceback.print_exc() - except Exception as ex: + #updated = CrawlerDatabase.select_int(sqlQueryString) + #logger.info ("updated is " + str(updated)) + #if (updated == 1): + sqlUpdate = "UPDATE MetadataProject SET project_name = '"+ str(project)+ "'," + if (len(record[1])>0): + sqlUpdate = sqlUpdate + " description = '"+ str(record[1]) + "', " + sqlUpdate = sqlUpdate + " homepage = '" + str(url) + "', versionUrl='" +str(release_url) + "', version='" + str(revision) + "', deploymentUrl='"+str(deployment_url) + sqlUpdate = sqlUpdate + "', updated = 1 WHERE project_name = '" +str(project) + "'" + print("here send email with the messsage tht a project is updated with a new version") + #logger.info(sqlUpdate) + try: + CrawlerDatabase.execute(sqlUpdate,True) + #logger.info("UPDATE SUCCESS ") + #self.elasitc_send_data() + except: traceback.print_exc() - else: - logger.info("a questo punto inseriamo il nostro project e flag updated a 0 , i dati sono pronti per essere spediti") - logger.info(project + " NOT EXIST. 
INSERT IT !") - sqlInsertMetadataProject = [] - sqlParamMetadataProject = [] - sqlStringMetadataProject = "" - #se il progetto non esiste allora fevo inserirlo e metto update a 0 - #qui fai insert coi dati nuovi e metti a 1 il flag updated - sqlInsertMetadataProject.insert(0,"INSERT INTO MetadataProject (project,") - sqlParamMetadataProject.insert(0,"VALUES (%(project)s,") - - if (len(record[1])>0): - sqlInsertMetadataProject.extend(" description,") - sqlParamMetadataProject.extend(" %(description)s, ") - - sqlInsertMetadataProject.extend(" url, versionUrl, version, deploymentUrl,idDWBatch,updated) ") - sqlParamMetadataProject.extend(" %(url)s, %(versionUrl)s, %(version)s, %(deploymentUrl)s, %(idDWBatch)s, 0)"); - sqlStringMetadataProject = ''.join (sqlInsertMetadataProject)+''.join (sqlParamMetadataProject) - logger.info("--- SQL STRING ---") - logger.info(sqlStringMetadataProject) - #time.sleep(5) - try: - logger.info("I'M GOING TO INSERT NEW METADATA PROJECT INFORMATION") - CrawlerDatabase.execute(sqlStringMetadataProject, True, parameters) - logger.info("INSERT SUCCESS") - #time.sleep(5) - except: - # printing stack trace - traceback.print_exc() - #self.send_data() - #self.get_response(updated) - # response = session.get('http://127.0.0.1:6543/project?updated='+str(updated)) - #response = request.patch('http://127.0.0.1:6543/projects?updated=0') - #print response.status_code - #response = requests.request(method='post','http://127.0.0.1:6543/configuration/knowdata') - - #if response.status_code == 200: - # data = response.json_body - # idProject = data['idProject'] - # cursor = CrawlerDatabase.execute_cursor("SELECT idProject FROM MetadataProject WHERE updated=0 ") - # results = cursor.fetchall() - # for record in results: - # CrawlerDatabase.execute("UPDATE MetadataProject SET updated=1 WHERE idProject ='"+str(record[0])+"'", True) - # return response.status_code - ''' - listOfProjects = [] - listOfProjects.append(parameters) - ''' - #json_data = TrivialJSONEncoder().encode(listOfProjects) - #print json_data - #self.writeJson(json_data) - - except Exception as ex: - logger.ino(str(ex)) - - - end = time.time() - print ("time is ", str(end - start)) - #self.send_data() - - except Exception, ex: - logger.error(str(ex)) - # do nothing; sometimes repository is missing - - except Exception, ex: - logger.info(str(ex) + " missing in " + line) - except Exception, ex: - logger.info(str(ex) + " missing in " + line) - \ No newline at end of file + traceback.print_exc() + else: + #logger.info("a questo punto inseriamo il nostro project e flag updated a 0 , i dati sono pronti per essere spediti") + logger.info(project + " NOT EXIST. 
INSERT IT !") + sqlInsertMetadataProject = [] + sqlParamMetadataProject = [] + sqlStringMetadataProject = "" + #se il progetto non esiste allora fevo inserirlo e metto update a 0 + #qui fai insert coi dati nuovi e metti a 1 il flag updated + sqlInsertMetadataProject.insert(0,"INSERT INTO MetadataProject (project_name,") + sqlParamMetadataProject.insert(0,"VALUES (%(project)s,") + if (len(record[1])>0): + sqlInsertMetadataProject.extend(" description,") + sqlParamMetadataProject.extend(" %(description)s, ") + + sqlInsertMetadataProject.extend(" homepage, versionUrl, version, deploymentUrl,updated, typeRepo) ") + sqlParamMetadataProject.extend(" %(url)s, %(versionUrl)s, %(version)s, %(deploymentUrl)s, 0, 'gitHub')"); + sqlStringMetadataProject = ''.join (sqlInsertMetadataProject)+''.join (sqlParamMetadataProject) + #print("INSERT") + #print(sqlStringMetadataProject) + try: + #logger.info("I'M GOING TO INSERT NEW METADATA PROJECT INFORMATION") + CrawlerDatabase.execute(sqlStringMetadataProject, True, parameters) + logger.info("INSERT SUCCESS") + #print("no send an email") + #logger.info("DOPO elastic send data") + + except: + # printing stack trace + traceback.print_exc() + + data = json.loads(subprocess.check_output("curl -u " + Configuration.githubUser +":" + Configuration.githubToken + " -H 'Accept: application/vnd.github.mercy-preview+json' https://api.github.com/repos/" + str(record[0]) + "/topics ", shell=True, cwd=Configuration.github_file_path + "/topics")) + #print("Adesso knowledgebase ????") + #print(str(data)) + if (data["names"] != []): #and labelData != []): + labelData = json.loads(subprocess.check_output("curl -u " + Configuration.githubUser +":" + Configuration.githubToken + " -H 'Accept: application/vnd.github.mercy-preview+json' https://api.github.com/repos/" + str(record[0]) +"/labels", shell=True, cwd=Configuration.github_file_path + "/topics")) + labels = [] + try : + for n in labelData: + labels.append(n['name']) + except: + labels = [] + #print("KNOWLEDGE BASE") + + #read my metadataproject + sqlQueryString = "SELECT count(*) FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(project) + sqlQueryString = sqlQueryString + "' AND version= '" + str(revision) + sqlQueryString = sqlQueryString + "' AND versionUrl = '" + release_url + sqlQueryString = sqlQueryString + "' AND homepage = '" + str(url) + sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "'" + logger.info(sqlQueryString) + if (CrawlerDatabase.select_int(sqlQueryString) > 0): + sqlQueryString = "SELECT project_name,description, version,versionUrl, typeRepo FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project_name= '" + str(project) + sqlQueryString = sqlQueryString + "' AND version= '" + str(revision) + sqlQueryString = sqlQueryString + "' AND versionUrl = '" + release_url + sqlQueryString = sqlQueryString + "' AND homepage = '" + str(url) + sqlQueryString = sqlQueryString + "' AND deploymentUrl='" + str(deployment_url) + "'" + logger.info(sqlQueryString) + cursor = CrawlerDatabase.execute_cursor(sqlQueryString) + results = cursor.fetchall() + for record in results: + knowledgebase={ + "projectName":str(record[0]), + "description":str(record[1]), + "created":str(date.today()), + "downloadPage":"https://api.github.com/repos/"+record[0]+ "/downloads", + "homepage":"https://api.github.com/repos/" + record[0], + "programmingLanguage":programminglanguage, + "release":{'url':"https://api.github.com/" + str(record[0]) +"/repos", 
+ 'rel':{ + "revision":str(record[2]), + "name":str(record[3]), + "created":None} + }, + "serviceEndpoint":"https://api.github.com/search/repositories?q=" +str(record[0]), + "typeRepo":str(record[4]), + "topic":data["names"], + "label": labels, + "licenseUrl" :"https://api.github.com/repos/" + record[0] +"/license", + "repositoryUrl":"https://api.github.com/" + record[0] +"/repos" + + } + if not (es_client.indices.exists('knowbase')): + res = es_client.create(index='knowbase',body=knowledgebase,id=i) + else: + #print("Loading ..............") + # get a response using the Search API + #res = es_client.search(index="values-knowdata", body=query_all) + res = es_client.index(index='knowbase',body=knowledgebase,id=i) + #res =helpers.bulk(es_client, self.bulk_json_data(knowledgebase, "values-knowdata", i),refresh=True) + i = i + 1 + else: + logger.info("No data no insert to knowbase") + else: + print ("Project moved.") + except: + traceback.print_exc() + pass + except: + traceback.print_exc() + pass + #traceback.print_exc() + except Exception as ex: + #print "Second Exception for the try of Doap Module" + traceback.print_exc() + tot = self.get_all_docs() + #print(("TOT for getAllDocs ", tot)) + #print ("------------------------------------------------------------------") + #print(("TOTAL TIME FOR DOAP-MODEL:", time.time() - start_time, "seconds.")) + result = self.searchQuery() + #print(("RESUlT of SEARCHQUERY ", result)) + except Exception as ex: + traceback.print_exc() diff --git a/web-crawler/github_data_fetcherOld.py b/web-crawler/github_data_fetcherOld.py new file mode 100755 index 0000000000000000000000000000000000000000..2319bf122ec2a0ffefea6d615348d532853bb130 --- /dev/null +++ b/web-crawler/github_data_fetcherOld.py @@ -0,0 +1,617 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# Copyright 2014 Bitergium SLL + +#MDG March 2021 +#Owner Engineering Ingegneria Informatica S.p.A. +#Update the code for the MORPHEMIC release 1.5 + +from doapfiend.doaplib import load_graph +from generic_data_fetcher import GenericDataFetcher +from Utils import Configuration, Logger, Sources +from datetime import date, timedelta +from CrawlerDatabase import CrawlerDatabase +from subprocess import Popen +import subprocess, json +import glob +from calendar import monthrange +import os +import traceback +import time +from dw_batch import DwBatch +import requests +from requests.sessions import session +import ast + +from elasticsearch import helpers, Elasticsearch +import elasticsearch +import logging + +# import the Elasticsearch client library +from elasticsearch import exceptions + +# import JSON and time +import json, time +# create a timestamp using the time() method +start_time = time.time() + +# declare globals for the Elasticsearch client host +DOMAIN = "localhost" +PORT = 9200 +INDEX = "values-knowdata" +# concatenate a string for the client's host paramater +host = str(DOMAIN) + ":" + str(PORT) #+ "/values-knowdata" + +# declare an instance of the Elasticsearch library +client = Elasticsearch(host) + +#es_client used to create the knowldgebase dopp-model +es_client = Elasticsearch([{'host':str(DOMAIN), 'port':str(PORT)}]) + + + +class GitHubDataFetcher(GenericDataFetcher): + + def __init__(self): + + super(GitHubDataFetcher, self).__init__(Sources.Github) + if (es_client.ping()): + print "Yeah KnowledgeBase is up and running!" 
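One way to keep the knowbase index free of duplicates when the same project is re-crawled, instead of deriving a numeric id from a scan of existing hits as doap_model() does above, is to use the project name itself as the document id; a short sketch of that variant, reusing the same knowledgebase dict:

# record[0] is the "owner/name" string, so re-indexing the same project
# becomes an update of one document rather than a new document; sketch only.
es_client.index(index='knowbase', id=record[0], body=knowledgebase)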
+ else: + print "Ops it could not up and running!" + + + def run(self): + ''' + + ''' + logging.basicConfig(level=logging.ERROR) + try: + #r = requests.get('http://localhost:9200') + months_of_stats = 0 + dt = date.today() + while months_of_stats < Configuration.github_archive_months: + dt1 = dt.replace(day=1) #go to first day of month + dt = dt1 - timedelta(days=1) #back one day so I get previous month + year_minus_cursor = dt.year + month_minus_cursor = dt.month + gas = GithubArchiveStats(year_minus_cursor, month_minus_cursor) + #do I have data for this month + parameters = { + 'stat_year': year_minus_cursor, + 'stat_month': month_minus_cursor + } + if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: + months_of_stats = months_of_stats + 1 + #scommenta dopo che hai risolto if gas.filesAvailable(): + gas.fetchFiles() + gas.processFiles() + #indenta dopo che hai risolto + #gas.doap_model() + #gas.deleteJsonFile() + months_of_stats = months_of_stats + 1 + except Exception as ex: + print(str(ex)) + +class GithubArchiveStats(): + + """ + We fetch statistics from http://www.githubarchive.org/ + an instance is taking care of a specific month in a year + """ + def __init__(self, y, m): + self.y = y + self.m = m + + self.mm = "%02d" % m + #self.idDWBatch = self.batch.id_batch + dt = date.today() + + dt1 = dt.replace(day=1) #go to first day of month + dt = dt1 - timedelta(days=31) #add 31 days so I go to next month + self.mm_next_month = "%02d" % dt.month + self.yyyy_next_month = "%02d" % dt.year + self.html_monitor_text = "Github
" + + @staticmethod + def statsAvailable(): + """ + Returns true if there are N months of statistics in the local database out of the last N+1 months + where N = Configuration.github_archive_months + we look back N+1 months because testermonth's statistics will not be ready the during the first days + of the month; so it is ok to have the last N available even if yestermonth is not there + """ + logger = Logger.getInstance() + months_of_stats = 0 + how_many = 0 + date_cursor = date.today() + while months_of_stats <= Configuration.github_archive_months: + dt1 = date_cursor.replace(day=1) #go to first day of month + date_cursor = dt1 - timedelta(days=1) #back one day so I get previous month + year_minus_cursor = date_cursor.year + month_minus_cursor = date_cursor.month + #do I have data for this month + parameters = { + 'stat_year': year_minus_cursor, + 'stat_month': month_minus_cursor + } + + if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0: + how_many = how_many + 1 + months_of_stats = months_of_stats + 1 + #logger.debug("GithubArchiveStats.statsAvailable: Do we have any stats to process?" + str(how_many >= Configuration.github_archive_months)) ## DEBUG + return how_many >= Configuration.github_archive_months + + + def filesAvailable(self): + """ + Are files available at http://data.githubarchive.org + I assume all files for a month are available if first file of next month is available + """ + proc = Popen("wget http://data.githubarchive.org/" + self.yyyy_next_month + "-" + self.mm_next_month + "-01-0.json.gz", shell=True, cwd=Configuration.temporary_directory) + return_code = proc.wait() + if return_code == 8: + return False + return True + + def fetchFiles(self): + """ + Files are per hour with name: YEAR-MONTH-DAY-HOUR.json.gz + """ + logger = Logger.getInstance() + self.html_monitor_text = "Github Integrator
" + + #Pavia: for day_iter in range(1, monthrange(self.y, self.m)[1] + 1): #number of days in this month + for day_iter in range(3, 5): #number of days in this month + #for day_iter in range(1, 2): #number of days in this month + #Maria marzo 2021for day_iter in range(1, 4): + #Maria giugno 2021 for hour_iter in range(10,12): + for hour_iter in range(14,16): + #for hour_iter in range(10, 12): + sz_day = "%02d" % day_iter + sz_hour = str(hour_iter) + if not os.path.isfile(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz"): + proc = Popen("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz", shell=True, cwd=Configuration.github_file_path + "/gh") + #proc = Popen("wget http://data.githubarchive.org/2020-12-1.json.gz", shell=True, cwd=Configuration.github_file_path + "/gh") + + return_code = proc.wait() + if return_code == 8: + logger.error("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz" + " returned error code 8") + #self.processFiles() + + time.sleep(5) + def processFiles(self): + logger = Logger.getInstance() + project_name = "" + release_id = "" + print("ProcessFile") + #print Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json.gz" + compressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json.gz") + for compressed_file in compressed_files: + proc = Popen("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh") + return_code = proc.wait() + if return_code == 8 : + logger.error("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh") + #uncompressed_files = glob.glob(Configuration.flossmole_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json") + + #print Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json" + #print ("===============================") + uncompressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json") + #print("uncrompressed file") + #print uncompressed_files + for uncompressed_file in uncompressed_files: + #print uncompressed_file + #if not(os.path.isfile(uncompressed_file)): + with open(uncompressed_file) as f: + content = f.readlines() + #print "CONTENT" + #print content + for line in content: + try: + decoded = json.loads(line) + #print "==== DECODED=====" + #print decoded + # GistEvent lines have no repository + if decoded["type"] != "GistEvent" : #not interested in Gists + #To speed up testing restrict to ReleaseEvent + #if decoded["type"] == "ReleaseEvent": + repo = decoded["repo"] + actor = decoded["actor"] + try: + if decoded["type"] == "RepositoryEvent" and ( decoded["action"] == "created" or decoded["action"] == "edited" or decoded["action"] == "renamed" ): + try: + project_description = decoded["description"] + #logger.debug("Found description:" + project_description + " for project: " + repo["name"]) + except: + project_description = "" + else: + project_description = "" + + parameters = { + 'project_name': repo["name"], + 'description': project_description, + 'stat_year': self.y, + 'stat_month': self.m + } + if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", parameters) > 0: + + sqlString = "select stat_year from rc_gh_archive where project_name LIKE 
%(project_name)s" + + if parameters['description'] == "": #if description is empty I do not overwrite it as it might have been there in other events + CrawlerDatabase.execute("UPDATE rc_gh_archive SET event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", True, parameters) + else: + CrawlerDatabase.execute("UPDATE rc_gh_archive SET description=%(description)s, event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", True, parameters) + + else: + try: + CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s)", True, parameters) + except: + traceback.print_exc() + + if decoded["type"] == "ReleaseEvent": + payload = decoded["payload"] + release = payload["release"] + id = release["id"] + parameters = { + 'project_name': repo["name"], + 'stat_year': self.y, + 'stat_month': self.m, + 'url': str(release["url"]), + 'version': str(release["id"]), + 'developer':str(actor["login"]) + } + #search if project with specific version yet exist in the DB + sql_query = "SELECT rel.project_name FROM rc_gh_archive_release as rel, rc_gh_archive as arc " + sql_query = sql_query + " WHERE arc.project_name = '" + str(repo["name"]) + "' and rel.project_name=arc.project_name;" + #print sql_query + project_name = CrawlerDatabase.select_natural(sql_query) + #print("project name ", project_name) + #sqlString = "INSERT INTO rc_gh_archive_release (project_name, url, version) VALUES (%(project_name)s, %(url)s, %(version)s)" + #print sql_query + #qui dovrei evitare di inserire progetti gia esistenti per git + #se il progetto non esiste lo inserisco ma se esiste guardo la release + #IF the project is not stored INSERT into Crawler DB + #time.sleep(10) + if (project_name == None): + try: + CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version,developer_name) VALUES (%(project_name)s, %(url)s, %(version)s,%(developer)s)", True, parameters) + except: + #print "A" + traceback.print_exc() + else: + #ELSE if the release is not stored into Crawler DB update the project name row with the new release + sql_query = "SELECT count(*) FROM rc_gh_archive_release where project_name ='" +str(repo["name"]) + "' and version = '" +str(release["id"]) +"' ORDER BY project_name" + # print sql_query + num = CrawlerDatabase.select_int(sql_query) + #se la release non esiste ma esiste il progetto aggirono la release + #altrimenti non faccio nulla + #print("count is ", str(num)) + if (CrawlerDatabase.select_int(sql_query) == 0): + try: + #CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version,developer_name) VALUES (%(project_name)s, %(url)s, %(version)s,%(developer)s)", True, parameters) + + CrawlerDatabase.execute("UPDATE rc_gh_archive_release set version = '" + str(release["id"]) +"' where project_name = '" + str(repo["name"]) + "'",True) + except: + traceback.print_exc() + print("Now create into kb the project fetched") + self.doap_model() + #time.sleep(5) + + except Exception, ex: + print("first exception") + #logger.info(str(ex) + " 1 missing in " + str(line)) + #time.sleep(5) + traceback.print_exc + except Exception, ex: + #print "E" + #logger.info(str(ex) + " 2 missing in " + str(line)) + traceback.print_exc + + #else: + #I can removed the file because the project has been processed + #os.remove(uncompressed_file) 
+ # print "" + #print (str(uncompresos.remove(uncompressed_file)sed_file) + " has been removed.") + + ''' + def deleteJsonFile(self): + jsonFile = Configuration.github_file_path + "/gh" + os.remove(jsonFile) + ''' +#START CREATION DOAP MODEL for GIT PROJECTS + #def create_index(self,index_name,know_body): + def create_index(self,index_name): + # index settings + settings = { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0 + } + } + #"content":know_body + + #Ignore 400 means to ignore "Index Already Exist" error. + #if not es_object.indices.exists(index_name): + es_client.index(index=index_name,body=settings) + #print('Created Index ', index_name) + return index_name + + #dismissed at the moment + def format_results(self,results): + + """Print results nicely: + doc_id) content + """ + data = [doc for doc in results['hits']['hits']] + ''' + for d in data: + if (keywords=='license'): + print("----- 5 -----") + print(d['_id'], d['_source']['license']) + else: + print("---- versionUrl ----") + print(d['_source']['versionUrl']) + ''' + return data + + def get_all_docs(self): + try: + # use the JSON library's dump() method for indentation + info = json.dumps(es_client.info(), indent=4) + # pass client object to info() method + #print ("Elasticsearch client info():", info) + except exceptions.ConnectionError as err: + print ("\nElasticsearch info() ERROR:", err) + print ("\nThe client host:", host, "is invalid or cluster is not running") + # change the client's value to 'None' if ConnectionError + #es_client = None + # valid client instance for Elasticsearch + if es_client != None: + # get all of the indices on the Elasticsearch cluster + all_indices = es_client.indices.get_alias("*") + # keep track of the number of the documents returned + doc_count = 0 + # iterate over the list of Elasticsearch indices + for num, index in enumerate(all_indices): + # declare a filter query dict object + match_all = { + "size": 100, + "query": { + "match_all": {} + } + } + # make a search() request to get all docs in the index + resp = es_client.search( + index = index, + body = match_all, + scroll = '2s' # length of time to keep search context + ) + # keep track of pass scroll _id + old_scroll_id = resp['_scroll_id'] + # use a 'while' iterator to loop over document 'hits' + while len(resp['hits']['hits']): + # make a request using the Scroll API + resp = es_client.scroll( + scroll_id = old_scroll_id, + scroll = '2s' # length of time to keep search context + ) + # check if there's a new scroll ID + if old_scroll_id != resp['_scroll_id']: + print ("NEW SCROLL ID:", resp['_scroll_id']) + # keep track of pass scroll _id + old_scroll_id = resp['_scroll_id'] + listOfAll = [] + # iterate over the document hits for each 'scroll' + for doc in resp['hits']['hits']: + listOfAll.append(doc['_source']) + doc_count += 1 + + print ("\nTOTAL DOC COUNT:", doc_count) + print ("TOTAL TIME TO GET ALL DOCS:", time.time() - start_time, "seconds.") + return listOfAll + + def searchQuery(self): + res= es_client.search(index='values-knowdata',doc_type='_doc',body={ + "query": { + "match_all": {} + } + #'query':{ + # 'match':{ + # "content":"kubernetes, docker,FPGA,serverless" + # }, + # 'match_phrase':{ + # "about":"kubernetes, docker, FPGA, serverless" + # } + # } + }) + + for hit in res['hits']['hits']: + print hit['_index'] + print hit['_id'] + print hit['_type'] + print hit['_score'] + print hit['_source'] + print(res['hits']["total"]) + return res + + def checkFileExistance(self, filePath): + try: + with 
open(filePath,'r') as f: + return True + except FileNotFoundError as e: + return False + except IOError as e: + return False + + ''' + generator to push bulk data from a JSON + file into an Elasticsearch index + ''' + def bulk_json_data(self, knowledgebase, _index, _id): + + #json_list = knowledgebase + #for doc in json_list: + # use a `yield` generator so that the data + # isn't loaded into memory + # print doc + #if '{"index"' not in doc: + yield { + "_index": _index, + "_id": _id, + "_source": knowledgebase + } + + def doap_model(self): + logger = Logger.getInstance() + try: + logger.info("Starting Doap Model Integration") + #Load the whole batch; for flossmole gh_projects table is dropped and created every time it is downloaded so we do not filter by idDWBatch + #all fields in gh_projects: datasource_id, description, developer_name, fork_number, forked, homepage, last_modified, open_issues, private, project_name, url, watchers, XML + parameters = { 'how_many': Configuration.github_top_projects_event_count } + if not es_client.indices.exists('values-knowdata'): + self.create_index('values-knowdata') + + sql_query = "SELECT rel.project_name, arc.description, rel.url, rel.version " + sql_query = sql_query + " FROM rc_gh_archive_release rel LEFT JOIN" + sql_query = sql_query + " (select project_name, description FROM rc_gh_archive GROUP BY project_name)" + sql_query = sql_query + " as arc on rel.project_name=arc.project_name ORDER BY rel.project_name" + cursor = CrawlerDatabase.execute_cursor(sql_query,parameters) + results = cursor.fetchall() + print("Loading ..............") + current_project_name = "" + + for record in results: + if record[0] != None: + try: + topic = [] + + programminglanguage = [] + name = str(record[0]) + name_index= name.find("/") + fullname = name[name_index+1:len(name)] + downloadPage = "https://api.github.com/repos/"+record[0]+ "/downloads" + homePage = "https://api.github.com/repos/" + record[0] + programminglanguage.append("https://api.github.com/repos/"+record[0]+ "/languages") + serviceEndPoint = "https://api.github.com/search/repositories?q=" +str(fullname) + licenseUrl = "https://api.github.com/repos/" + record[0] +"/license" + + #data = json.loads(subprocess.check_output("curl -H 'Accept: application/vnd.github.mercy-preview+json' https://api.github.com/repos/" + str(record[0]) + "/topics >> " + Configuration.github_file_path + "/topics/" + str(fullname)+".json", shell=True, cwd=Configuration.github_file_path + "/topics")) + data = json.loads(subprocess.check_output("curl -u " + Configuration.githubUser +":" + Configuration.githubToken + " -H 'Accept: application/vnd.github.mercy-preview+json' https://api.github.com/repos/" + str(record[0]) + "/topics ", shell=True, cwd=Configuration.github_file_path + "/topics")) + try: + if (data["names"] != []): #and labelData != []): + labelData = json.loads(subprocess.check_output("curl -u " + Configuration.githubUser +":" + Configuration.githubToken + " -H 'Accept: application/vnd.github.mercy-preview+json' https://api.github.com/repos/" + str(record[0]) +"/labels", shell=True, cwd=Configuration.github_file_path + "/topics")) + labels = [] + try : + for n in labelData: + labels.append(n['name']) + except: + labels = [] + #print label + releaseurl = "https://api.github.com/repos/" + record[0] + "/releases/"+str(record[3]) + downloadPage = "https://api.github.com/repos/"+record[0]+ "/downloads" + print "KNOWLEDGE BASE" + parameters={ + "projectName":str(record[0]), + "description":str(record[1]), + 
"created":str(date.today()), + "downloadPage":"https://api.github.com/repos/"+record[0]+ "/downloads", + "homepage":"https://api.github.com/repos/" + record[0], + "programmingLanguage":programminglanguage, + "release":{'url':"https://api.github.com/repos/" + record[0] + "/releases/"+str(record[3]), + 'rel':{ + "revision":str(record[3]), + "name":str(fullname), + "created":None} + }, + "serviceEndpoint":"https://api.github.com/search/repositories?q=" +str(record[0]), + "typeRepo":"GitHub", + "topic":data["names"], + "label": labels, + "licenseUrl" :"https://api.github.com/repos/" + record[0] +"/license", + "repositoryUrl":"https://api.github.com/" + record[0] +"/repos" + } + + sqlQueryString = "SELECT release FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project= '" + str(record[0]) + sqlQueryString = sqlQueryString + "' AND version= '" + str(record[3]) + sqlQueryString = sqlQueryString + "' AND versionurl = '" + releaseurl + sqlQueryString = sqlQueryString + "' AND downloadpage='" + str(downloadPage) + sqlQueryString = sqlQueryString + "' AND homepage = '" + str(homePage) + "'" + #logger.info(sqlQueryString) + release = CrawlerDatabase.select_natural(sqlQueryString) + + if release !=None: + try: + sqlQueryString = "SELECT updated FROM MetadataProject " + sqlQueryString = sqlQueryString + " WHERE project= '" + str(record[0]) + sqlQueryString = sqlQueryString + "' AND version= '" + str(record[3]) + sqlQueryString = sqlQueryString + "' AND versionurl = '" + releaseurl + sqlQueryString = sqlQueryString + "' AND downloadpage='" + str(downloadPage) + sqlQueryString = sqlQueryString + "' AND homepage = '" + str(homepage) + "'" + + #logger.info(sqlQueryString) + updated = CrawlerDatabase.select_int(sqlQueryString) + if (updated == 1): + + sqlUpdate = "UPDATE MetadataProject SET project = '"+ str(record[0])+ "'," + if (len(record[1])>0): + sqlUpdate = sqlUpdate + " description = '"+ str(record[1]) + "', " + sqlUpdate = sqlUpdate + " homepage = '" + str(homepage) + sqlUpdate = sqlUpdate + "', versionurl='" +str(releaseurl) + sqlUpdate = sqlUpdate + "', version='" + str(record[3]) + sqlUpdate = sqlUpdate + "', downloadpage='"+str(downloadPage) + sqlUpdate = sqlUpdate + "', created='"+str(date.today()) + sqlUpdate = sqlUpdate + "', programming_language='"+str(programminglanguage) + sqlUpdate = sqlUpdate + "', serviceEndpoint='https://api.github.com/search/repositories?q='" +str(record[0]) + sqlUpdate = sqlUpdate + "', typeRepo='GitHub'" + sqlUpdate = sqlUpdate + "', topic='"+str(data['names']) + sqlUpdate = sqlUpdate + "', licenseUrl='https://api.github.com/repos/" + record[0] +"/license" + sqlUpdate = sqlUpdate + "', label='"+str(labels) + sqlUpdate = sqlUpdate + "', repositoryUrl='https://api.github.com/" + record[0] +"/repos' " + sqlUpdate = sqlUpdate + ", updated = 0 WHERE project = '" +str(record[0]) + "'" + logger.info(sqlUpdate) + try: + CrawlerDatabase.execute(sqlUpdate,True) + logger.info("UPDATE SUCCESS ") + except: + traceback.print_exc() + except Exception as ex: + traceback.print_exc() + else: + sqlInsertMetadataProject = [] + sqlParamMetadataProject = [] + sqlStringMetadataProject = "" + #se il progetto non esiste allora fevo inserirlo e metto update a 0 + #qui fai insert coi dati nuovi e metti a 1 il flag updated + sqlInsertMetadataProject.insert(0,"INSERT INTO MetadataProject (project,") + sqlParamMetadataProject.insert(0,"VALUES (%(project)s,") + + if (len(record[1])>0): + sqlInsertMetadataProject.extend(" description,") + sqlParamMetadataProject.extend(" 
%(description)s, ") + + sqlInsertMetadataProject.extend(" url, versionUrl, version, deploymentUrl,idDWBatch,updated) ") + sqlParamMetadataProject.extend(" %(url)s, %(versionUrl)s, %(version)s, %(deploymentUrl)s, %(idDWBatch)s, 0)"); + sqlStringMetadataProject = ''.join (sqlInsertMetadataProject)+''.join (sqlParamMetadataProject) + try: + CrawlerDatabase.execute(sqlStringMetadataProject, True, parameters) + except: + # printing stack trace + traceback.print_exc() + + + else: + print "non considero" + except: + traceback.print_exc() + print "except allora non considero e passo" + pass + #traceback.print_exc() + except Exception as ex: + #print "Second Exception for the try of Doap Module" + traceback.print_exc() + + except Exception as ex: + traceback.print_exc() \ No newline at end of file diff --git a/web-crawler/jquery_plugin_data_fetcher.py b/web-crawler/jquery_plugin_data_fetcher.py index 845dd2b326c3ac849672ead6307b4f11e2535bb2..258187213d26e5a234be175b6112e7547a553594 100755 --- a/web-crawler/jquery_plugin_data_fetcher.py +++ b/web-crawler/jquery_plugin_data_fetcher.py @@ -2,17 +2,47 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. #Copyright 2014 Bitergium SLL +#Updated on March 2020 for MORPHEMIC projecct release 1.0 +#Last updated on July2021 +# Ownership : Engineering Ingegneria Informatica S.p.A. +# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo +import urllib.request, urllib.error, urllib.parse +#from subprocess import Popen +import sys, time, traceback +import ast +import elasticsearch +import logging +import doap_project from generic_data_fetcher import GenericDataFetcher from CrawlerDatabase import CrawlerDatabase -from Utils import Logger, Configuration, Sources, UrllibHelper -from lxml import html -import urllib2 -#from subprocess import Popen -import sys -from lxml import etree -from _mysql import NULL -from asn1crypto._ffi import null +from Utils import Logger, Configuration, Sources, UrllibHelper,DateHelper +from lxml import html,etree +from elasticsearch import helpers, Elasticsearch +from doap_project import FoafPerson +from datetime import date, timedelta +# import the Elasticsearch client library +from elasticsearch import exceptions + +# create a timestamp using the time() method +start_time = time.time() + +# declare globals for the Elasticsearch client host +DOMAIN = "localhost" +PORT = 9200 +INDEX = "knowbase" +# concatenate a string for the client's host paramater +host = str(DOMAIN) + ":" + str(PORT) #+ "/values-knowdata" + +# declare an instance of the Elasticsearch library +client = Elasticsearch(host) + +#es_client used to create the knowldgebase dopp-model +es_client = Elasticsearch([{'host':str(DOMAIN), 'port':str(PORT)}]) + + class JQueryPluginDataFetcher(GenericDataFetcher): ''' @@ -23,6 +53,11 @@ class JQueryPluginDataFetcher(GenericDataFetcher): def __init__(self): #commented by maria super(JQueryPluginDataFetcher, self).__init__(source) super(JQueryPluginDataFetcher, self).__init__(Sources.JQueryPlugin) + if (es_client.ping()): + print("Yeah KnowledgeBase is up and running!") + else: + print("Ops it could not up and running!") + def run(self): ''' There seems not to be a listing of all projects on the site; we use very generic queries to list all @@ -30,14 +65,15 @@ class JQueryPluginDataFetcher(GenericDataFetcher): a project that lies outside these two queries ''' logger = Logger.getInstance() - logger.info("run run ma 
dove siamo?") #Maria giugno 2020 self.parseQuery("i") #Maria giugno 2020 self.parseQuery("e") #uso questa query https://www.npmjs.com/search?q=keywords:jquery-plugin self.parseTag("i") + #self.doap_model() #https://plugins.jquery.com/?s= #self.parseQuery("jquery") + def parseTag(self, tag_string): ''' ''' @@ -49,7 +85,7 @@ class JQueryPluginDataFetcher(GenericDataFetcher): A paginated list of projects. It can be the result of a query, the list of projects with a tag, ... ''' logger = Logger.getInstance() - logger.info("Starting JQuery Plugin Project Page") + #logger.info("Starting JQuery Plugin Project Page") try: # I extract the number of pages the results are split into # and I loop parsing each page @@ -58,14 +94,171 @@ class JQueryPluginDataFetcher(GenericDataFetcher): current_plp = JQueryPluginListPage(jqplp.search_string_or_tag, i, is_a_tag) if current_plp.project_urls != None : for project_url in current_plp.project_urls: - #print "project url : " + #print("project url : ") jqppp = JQueryPluginProjectPage(project_url) p = jqppp.project() p.idDWBatch = self.batch.id_batch p.save() + self.doap_model() except TypeError as e: - print e - print sys.exc_type + #logger.info("you are here") + traceback.print_exc() + #print(e) + print((sys.exc_info()[0])) + + ''' + generator to push bulk data from a JSON + file into the knowledgebase index + ''' + def bulk_json_data(self, knowledgebase, _index, _id): + + yield { + "_index": _index, + "_id": _id, + "_source": knowledgebase + } + + + def doap_model(self): + logger = Logger.getInstance() + + try: + #Load the whole batch + #parameters = { 'idDWBatch': self.batch.id_batch } + cursor = CrawlerDatabase.execute_cursor("SELECT idRAW_JQ_Project, entry_title, attribution, description, download_link FROM RAW_JQ_Project ") + results = cursor.fetchall() + #for each project in the batch + for record in results: + idRAW_JQ_Project = record[0] + name = record[1] + shortdesc = "" + description = record[3] + homepage = "" + old_homepage = [] + tmp = DateHelper("") + created = tmp.date + download_page = record[4] + mailing_list = "" + category = [] + parameters = { + 'idRAW_JQ_Project': idRAW_JQ_Project + } + tmp = DateHelper("") + stringa = "SELECT t.name FROM RAW_JQ_ProjectTag pt JOIN RAW_JQ_Tag t ON pt.idRAW_JQ_Tag=t.idRAW_JQ_Tag WHERE pt.idRAW_JQ_Project='" + str(idRAW_JQ_Project) + "'" + #print(stringa) + curCat = CrawlerDatabase.execute_cursor("SELECT t.name FROM RAW_JQ_ProjectTag pt JOIN RAW_JQ_Tag t ON pt.idRAW_JQ_Tag=t.idRAW_JQ_Tag WHERE pt.idRAW_JQ_Project=%(idRAW_JQ_Project)s", parameters) + resultsCat = curCat.fetchall() + for recordCat in resultsCat: + category.append(str(recordCat[0])) + #dp_new.category.append(str(recordCat[0])) + os = [] + programming_language = [] + licenseUrl = [] + stringauno ="SELECT l.name FROM RAW_JQ_ProjectLicense pl JOIN RAW_JQ_License l ON pl.idRAW_JQ_License=l.idRAW_JQ_License WHERE pl.idRAW_JQ_Project='" + str(idRAW_JQ_Project) + "'" + #print(stringauno) + curLicense = CrawlerDatabase.execute_cursor("SELECT l.name FROM RAW_JQ_ProjectLicense pl JOIN RAW_JQ_License l ON pl.idRAW_JQ_License=l.idRAW_JQ_License WHERE pl.idRAW_JQ_Project=%(idRAW_JQ_Project)s", parameters) + resultsLicense = curLicense.fetchall() + for recordLicense in resultsLicense: + #dp_new.license.append(str(recordLicense[0])) + licenseUrl.append(str(recordLicense[0])) + #dp_new.language = [] + language = [] + # I need the latest version so that I can guess the download link for earlier versions; e.g.: + # latest version 1.9.4 + # download link 
http://github.com/CSS-Tricks/AnythingSlider/zipball/v1.9.4 + # replacing "1.9.4" with "1.9.3" and so on.. I get download links to older versions + # often the download link does not contain the latest version name though + latestRelease = CrawlerDatabase.select_string("SELECT version, max(date) FROM RAW_JQ_Version WHERE idRAW_JQ_Project=%(idRAW_JQ_Project)s ", parameters) + release = [] + + #dp_new.release = [] + curRelease = CrawlerDatabase.execute_cursor("SELECT version, date FROM RAW_JQ_Version WHERE idRAW_JQ_Project=%(idRAW_JQ_Project)s ", parameters) + resultsRelease = curRelease.fetchall() + for recordRelease in resultsRelease: + #dv = DoapVersion() + try : + releaseName = recordRelease[0] + release.append(releaseName) + + except: + release = [] + tmp = DateHelper("") + tmp.load_from_jquery_format(recordRelease[1]) + created = str(tmp.date) + revision = "" + file_release = [] + if latestRelease in download_page: + file_release.append(download_page.replace(latestRelease, releaseName)) + #release.append(file_release) + + #print releaseName + download_mirror = [] + wiki = [] + bug_database = "" + + + developer = [] + documenter = [] + helper = [] + tester = [] + translator = [] + maintainer = [] + curMember = CrawlerDatabase.execute_cursor("SELECT m.name FROM RAW_JQ_ProjectMaintainer pm JOIN RAW_JQ_Maintainer m ON pm.idRAW_JQ_Maintainer=m.idRAW_JQ_Maintainer WHERE pm.idRAW_JQ_Project=%(idRAW_JQ_Project)s ", parameters) + resultsMember = curMember.fetchall() + for recordMember in resultsMember: + names = recordMember[0].split(" ") + fp = FoafPerson(names[0], " ".join(names[1:]), "N/A") + maintainer.append(fp) + + platform = "" + service_endpoint = "" + audience = "" + blog = "" + modified = False + modified_release = False + id_batch = self.batch.id_batch + knowledgebase={ + "projectName":name, + "description":str(description), + "created":str(date.today()), + "downloadPage":download_page, + "homepage":homepage, + "programmingLanguage":programming_language, + "release":{'url':"", + 'rel':{ + "revision":release, + "name":"", + "created":created} + }, + "serviceEndpoint":service_endpoint, + "typeRepo":"jQueryPlugin", + "topic":category, + "label":None, + "licenseUrl" : licenseUrl, + "repositoryUrl":"https://github.com/jquery/" + str(name) + } + logger.info("Loading metadata project into KnowledgeBase......") + try: + if not (es_client.indices.exists(INDEX)) : + res = es_client.indices.create(index=INDEX,body=knowledgebase,id=i) + else: + # get a response using the Search API + # Check the number of documents in your index + c = es_client.count(index=INDEX) + #print(c['count']) + i = c['count'] + 1 + #print(es_client.count(index="knowledgebase")) + helpers.bulk(es_client, self.bulk_json_data(knowledgebase, INDEX, i),refresh=True) + i = i + 1 + + except: + traceback.print_exc() + + # mark the batch as integrated + #self.batch.integrate() + except Exception as ex: + traceback.print_exc() + #print('Error integrating: ' + str(ex)) class JQueryPluginListPage(): ''' @@ -78,18 +271,20 @@ class JQueryPluginListPage(): ''' '''maria 2020 : https://www.npmjs.com/search?q=keywords:jquery-plugin ''' + #print("Now you're into JQUERY PLUGIN LIST PAGE class") self.is_a_tag = is_a_tag self.search_string_or_tag = search_string_or_tag self.page_number = page_number self.url = self.urlOfNthPage(str(self.page_number)) + #print("seldurl is ",str(self.url)) hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive'} - req = urllib2.Request(self.url, headers=hdr) - html_page = urllib2.urlopen(req) + req = urllib.request.Request(self.url, headers=hdr) + html_page = urllib.request.urlopen(req) self.tree = html.fromstring(html_page.read()) self.project_urls = self.tree.xpath('//h2[@class="entry-title"]/a/@href') @@ -113,9 +308,12 @@ class JQueryPluginListPage(): self.is_a_query = False http://plugins.jquery.com/tag/ui/ --> http://plugins.jquery.com/tag/ui/page/2/ self.is_a_query = True http://plugins.jquery.com/?s=i --> http://plugins.jquery.com/page/2/?s=i ''' + #print("now yu're into urlOfNthPage of JqueryListPage") if self.is_a_tag: + #print("now into if with i ",str(i)) return "https://plugins.jquery.com/tag/ui/page/" + str(i) + "/" else: + #print("in else ") return "https://jquery.com/" @@ -126,7 +324,8 @@ class JQueryPluginProjectPage(): def __init__(self, url): ''' ''' - self.url = "http:" + url + self.url = "http:" + str(url) + #print(self.url) #Maria 2020 questo sembra non funzionare html_page = UrllibHelper.urlopen(self.url) #Maria 2020 questo sembra non funzionare self.tree = html.fromstring(html_page) #Maria 2020 : quando estraimo dall'index.html il link al plugin di jquery ui , viene @@ -143,13 +342,17 @@ class JQueryPluginProjectPage(): 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive'} - req = urllib2.Request(self.url,"",hdr) - html_page = urllib2.urlopen(req) + req = urllib.request.Request(self.url, headers=hdr) + html_page = urllib.request.urlopen(req) + + #req = urllib.request.Request(self.url,"",hdr) + #print(req) + html_page = urllib.request.urlopen(req) self.tree = html.fromstring(html_page.read()) def project(self): logger = Logger.getInstance() - logger.info("-----> Starting JQuery Plugin Project Page") + #logger.info("-----> Starting JQuery Plugin Project Page") try: self.tree.xpath('//h2[@class="entry-title"]/a/text()') jqpp = JQueryPluginProject() @@ -162,6 +365,9 @@ class JQueryPluginProjectPage(): jqpp.description = self.tree.xpath('//div[@class="block description"]/text()')[0].replace("\n"," ").replace("\t"," ") jqpp.download_link = self.tree.xpath('//a[@class="download"]/@href')[0] tags = self.tree.xpath('//a[@class="tag icon-tag"]/text()') + homepage = self.tree.path('//a[@class="other-link view-homepage"]/@href')[0] + #print("homepage") + #print(homepage) jqpp.tags = [] for tag in tags: jqpt = JQueryPluginTag(tag) @@ -190,7 +396,8 @@ class JQueryPluginProjectPage(): jqpl = JQueryPluginLicense(oss_license) jqpp.licenses.append(jqpl) except Exception as ex: - print("JQueryPluginProjectPage.project() ") + print(ex) + #print("JQueryPluginProjectPage.project() ") return jqpp @@ -199,7 +406,7 @@ class JQueryPluginProject(): ''' def save(self): logger = Logger.getInstance() - logger.info("Starting JQuery Plugin Project") + #logger.info("Starting JQuery Plugin Project") ''' Save the project and related entities associated to current batch ''' @@ -233,7 +440,7 @@ class JQueryPluginProject(): oss_license.id_project = self.id oss_license.save() except Exception as ex: - print("JQueryPluginProject.save() " + self.title + " - "+ str(ex)) + print((self.title + " - "+ str(ex))) class JQueryPluginTag(): ''' @@ -244,7 +451,7 @@ class JQueryPluginTag(): def save(self): logger = Logger.getInstance() - logger.info("Starting JQuery Plugin Tag") + #logger.info("Starting 
JQuery Plugin Tag") ''' Save a tag on the database; On crawler's database tags are not associated to a batch. Save the association to its project @@ -273,7 +480,7 @@ class JQueryPluginVersion(): def save(self): logger = Logger.getInstance() - logger.info("Starting JQuery Plugin Version") + #logger.info("Starting JQuery Plugin Version") ''' Save a version on the database; On crawler's database versions are not associated to a batch. Save the association to its project @@ -294,7 +501,7 @@ class JQueryPluginLicense(): def save(self): logger = Logger.getInstance() - logger.info("Starting JQuery Plugin License") + #logger.info("Starting JQuery Plugin License") ''' Save a license on the database; On crawler's database licenses are not associated to a batch. Save the association to its project @@ -320,7 +527,7 @@ class JQueryPluginMaintainer(): def save(self): logger = Logger.getInstance() - logger.info("Starting JQuery Plugin Maintainer") + #logger.info("Starting JQuery Plugin Maintainer") ''' Save a Maintainer on the database; On crawler's database Maintainers are not associated to a batch. Save the association to its project diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/__init__.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ee7493fc35b735f3e68c61662fb2377fae21f3a --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/__init__.py @@ -0,0 +1,36 @@ + +#pylint: disable-msg=C0103 +""" +doapfiend +========= + +U{http://trac.doapspace.org/doapfiend} + +Description +----------- +doapfiend is a command-line client and library for querying, creating and +displaying DOAP (Description of a Project) RDF profiles. + +doapfiend uses RDFAlchemy and rdflib to parse and serialize DOAP. + +Plugins +------- +Plugins can be written for editing DOAP, scraping websites and creating DOAP, +searching for DOAP in SPARQL endpoints, displaying DOAP in various formats such +as HTML etc. 
+ + +""" + + +#Hack to get around warning in RDFAlchemy, bug filed upstream +import logging +log = logging.getLogger() +log.setLevel(logging.ERROR) + +__docformat__ = 'epytext' +__version__ = '0.3.3' +__author__ = 'Rob Cakebread ' +__copyright__ = '(C) 2007-2008 Rob Cakebread' +__license__ = 'BSD-2' + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/cli.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..1488d6feb378af49f11161a8e2641b075d8bfd54 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/cli.py @@ -0,0 +1,242 @@ + +# pylint: disable-msg=C0103 +''' + +cli.py +====== + +Command-line tool for querying, serializing and displaying DOAP + +Author: Rob Cakebread + +License : BSD-2 + +''' + +__docformat__ = 'epytext' +__revision__ = '$Revision: $'[11:-1].strip() + + +import sys +import logging +import optparse + +from doapfiend.plugins import load_plugins +from doapfiend.utils import COLOR +from doapfiend.__init__ import __version__ as VERSION +from doapfiend.doaplib import print_doap, follow_homepages, show_links + + +class DoapFiend(object): + + '''`DoapFiend` class''' + + def __init__(self): + '''Initialize attributes, set logger''' + self.doap = None + self.options = None + self.log = logging.getLogger('doapfiend') + self.log.addHandler(logging.StreamHandler()) + #Cache list of all plugins + self.plugins = list(load_plugins(others=True)) + self.serializer = None + + def get_plugin(self, method): + """ + Return plugin object if CLI option is activated and method exists + + @param method: name of plugin's method we're calling + @type method: string + + @returns: list of plugins with `method` + + """ + all_plugins = [] + for plugin_obj in self.plugins: + plugin = plugin_obj() + plugin.configure(self.options, None) + if plugin.enabled: + if not hasattr(plugin, method): + plugin = None + else: + all_plugins.append(plugin) + return all_plugins + + def set_log_level(self): + '''Set log level according to command-line options''' + if self.options.verbose: + self.log.setLevel(logging.INFO) + elif self.options.quiet: + self.log.setLevel(logging.ERROR) + elif self.options.debug: + self.log.setLevel(logging.DEBUG) + else: + self.log.setLevel(logging.WARN) + + def print_doap(self, doap_xml): + ''' + Print doap as n3, rdf/xml, plain text or using serialization plugin + + @param doap_xml: DOAP in RDF/XML serialization + @type doap_xml: text + + @rtype: None + @return: Just displays DOAP + + ''' + if self.options.write: + filename = self.options.write + else: + filename = None + print_doap(doap_xml, serializer=self.serializer, filename=filename, + color=not self.options.no_color) + + def get_search_plugin(self): + ''' + Return active search plugin callable + + @rtype: callable + @returns: A callable object that fetches for DOAP + ''' + plugins = self.get_plugin('search') + if len(plugins) == 1: + return plugins[0].search + + def run(self): + ''' + Run doapfiend command + + Find the active plugin that has a 'search' method and run it, + then output the DOAP with print_doap, using the active plugin + with a 'serializer' method. 
+ + + @rtype: int + @returns: 0 success or 1 failure + + ''' + opt_parser = self.setup_opt_parser() + (self.options, remaining_args) = opt_parser.parse_args() + self.set_serializer() + if not self.serializer and remaining_args: + opt_parser.print_help() + return 1 + self.set_log_level() + + if self.options.doapfiend_version: + return doapfiend_version() + + if self.options.no_color: + for this in COLOR: + COLOR[this] = '\x1b[0m' + search_func = self.get_search_plugin() + if search_func: + doap_xml = search_func() + if doap_xml: + if self.options.follow: + #Search for additional DOAP by looking up all doap:homepage + #found and then print all found. This may be used if the + #DOAP you've found isn't rich enough or with FOAF, where a + #person lists multiple projects they are affiliated with + #and you want to find DOAP based on the Projec homepages + #found in FOAF. + self.print_doap(doap_xml) + return follow_homepages(doap_xml) + elif self.options.show_links: + return show_links(doap_xml) + else: + return self.print_doap(doap_xml) + else: + opt_parser.print_help() + return 1 + + def set_serializer(self): + ''' + Find all plugins that are enabled on the command-line and have a + `serialize` method. If none are enabled, default to plain text + ''' + plugins = self.get_plugin('serialize') + if len(plugins) == 0: + self.serializer = None + else: + #Choose first serializer in case they try more than one + self.serializer = plugins[0].serialize + + def setup_opt_parser(self): + ''' + Setup the optparser + + @rtype: opt_parser.OptionParser + @return: Option parser + + ''' + usage = 'usage: %prog [options]' + opt_parser = optparse.OptionParser(usage=usage) + group_search = optparse.OptionGroup(opt_parser, + 'Search options', + 'Options for searching for DOAP') + + opt_parser.add_option('--version', action='store_true', + dest='doapfiend_version', default=False, + help='Show doapfiend version and exit.') + + opt_parser.add_option('-P', '--http-proxy', action='store', + dest='proxy', default=False, + help='Specify http proxy URL if you use one.') + + group_output = optparse.OptionGroup(opt_parser, + 'Output options', + 'Choose these options to change default output behavior') + + group_output.add_option('--debug', action='store_true', + dest= 'debug', default=False, + help='Show debugging information') + + group_output.add_option('-f', '--follow-links', action='store_true', + dest='follow', default=False, + help='Search for and show additional DOAP.', + metavar='FILENAME') + + group_output.add_option('-s', '--show-links', action='store_true', + dest='show_links', default=False, + help='Search for and show links to additional DOAP.', + metavar='FILENAME') + + group_output.add_option('-w', '--write', action='store', + dest='write', default=False, + help='Write DOAP to a file instead of displaying it.', + metavar='FILENAME') + + group_output.add_option('-C', '--no-color', action='store_true', + dest='no_color', default=False, + help="Don't use color in output") + + group_output.add_option('-q', '--quiet', action='store_true', + dest='quiet', default=False, help="Show less output") + + group_output.add_option('-v', '--verbose', action='store_true', + dest='verbose', default=False, help="Show more output") + + # add opts from plugins + for plugcls in self.plugins: + plug = plugcls() + plug.add_options(opt_parser, group_output, group_search) + opt_parser.add_option_group(group_search) + opt_parser.add_option_group(group_output) + return opt_parser + + +def doapfiend_version(): + '''Print doapfiend 
version''' + print VERSION + + +def main(): + '''Let's do it.''' + my_doapfiend = DoapFiend() + return my_doapfiend.run() + + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/doap2html.xsl b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/doap2html.xsl new file mode 100755 index 0000000000000000000000000000000000000000..ffb94d29d34f0a408c00383df6f219be8c8e87db --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/doap2html.xsl @@ -0,0 +1,186 @@ + + + + + + + + + + + + + <xsl:value-of select="doap:name/text()"/> + + + + +
+ [doap2html.xsl body: the XSL markup was stripped during extraction and cannot be reconstructed here; the recoverable section headings are Project, Release, Maintainer, Developer, Documenter, Translator, Tester, Helper, Repository, and Maker of DOAP Profile.]
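The stylesheet above renders a DOAP RDF/XML profile as an HTML page. Nothing in this patch actually applies it; purely as an illustration, an XSLT of this shape can be run with lxml, which the crawler already imports elsewhere in this patch. The file names below are placeholders, not files added here.

# Illustration only: applying a DOAP-to-HTML XSLT with lxml.
# "doap2html.xsl" and "project.rdf" are placeholder paths.
from lxml import etree

transform = etree.XSLT(etree.parse("doap2html.xsl"))
html_doc = transform(etree.parse("project.rdf"))
print(str(html_doc))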
diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/doaplib.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/doaplib.py new file mode 100644 index 0000000000000000000000000000000000000000..d537af1a7d0039217c6e0e9d45c5fd5db998d889 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/doaplib.py @@ -0,0 +1,339 @@ +#!/usr/bin/env python +#pylint: disable-msg=C0103 + +""" + +Library for parsing, displaying, querying and serializing DOAP + +""" + +import sys +import logging +import xmlrpclib +from cStringIO import StringIO +from xml.sax._exceptions import SAXParseException + +from rdfalchemy import rdfSubject +from rdflib import ConjunctiveGraph, Namespace + +from doapfiend.utils import fetch_file +from doapfiend.model import Project +from doapfiend.plugins import load_plugins + +LOG = logging.getLogger('doapfiend') +XMLRPC_SERVER = xmlrpclib.ServerProxy('http://doapspace.org/xmlrpc/') +DOAP_NS = Namespace('http://usefulinc.com/ns/doap#') + + +def follow_homepages(rdf_xml): + ''' + If there is a 'doap:Project homepage' it will be looked up + on doapspace.org using get_by_homepage to find any other + DOAP. This is useful if we're looking at FOAF and a project + is mentioned by homepage. It can also be used on DOAP files + to search for additional DOAP files about the same project. + + @param rdf_xml: RDF serialized as XML + @type : string + + @rtype: int + @returns: 0 on sucess or 1 if there was no DOAP in the RDF + ''' + homepages = list(get_homepages(rdf_xml)) + nbr_homepage_urls = len(homepages) + if nbr_homepage_urls >= 1: + print_doap_by_homepages(homepages) + else: + print 'No DOAP found in that RDF.' + return 1 + + +def show_links(rdf): + ''' + If there is a 'doap:Project homepage' it will be looked up + on doapspace.org using get_by_homepage to find any other + DOAP. This is useful if we're looking at FOAF and a project + is mentioned by homepage. It can also be used on DOAP files + to search for additional DOAP files about the same project. + + @param rdf: RDF serialized as XML + @type : string + + @rtype: int + @returns: 0 on sucess or 1 if there was no DOAP in the RDF + ''' + homepages = list(get_homepages(rdf)) + nbr_homepage_urls = len(homepages) + if nbr_homepage_urls >= 1: + for hpage_url in homepages: + print "Found project homepage:", hpage_url + #Search for DOAP by project homepage. + hpages = query_by_homepage(hpage_url) + for _src, hpage_url in hpages: + print ' Found DOAP: ', hpage_url + else: + print 'No DOAP found in that RDF.' + return 1 + + +def print_doap_by_homepages(homepages): + ''' + Given a list of homepage URLs, search for DOAP for each and print + + @param homepages: Project homepage + @type : list + + @rtype: None + @returns: None + ''' + for hpage_url in homepages: + print "Found project homepage", hpage_url + #Search for DOAP by project homepage. 
There may be none, one or multiple + hpages = query_by_homepage(hpage_url) + for _src, hpage_url in hpages: + print 'Found DOAP at ', hpage_url + doap_xml = fetch_doap(hpage_url) + print_doap(doap_xml) + +def get_homepages(rdf, format='xml'): + ''' + Find all doap:homepage in RDF + + @param rdf: RDF + @type rdf: string + + @param format: Serialization format + @type format: string + + @rtype: generator + @returns: homepages + ''' + store = ConjunctiveGraph() + store.parse(StringIO(rdf), publicID=None, format=format) + if rdf_has_doap(store): + for _s, o in store.subject_objects(DOAP_NS["homepage"]): + yield(str(o)) + +def rdf_has_doap(store): + ''' + Returns True if triplestore has the DOAP namespace defined + + @param store: triplestore + @type store: rdflib ConjunctiveGraph + + @rtype: boolean + @returns: True if triplestore contains DOAP namespace + + ''' + for namespace in store.namespaces(): + if namespace[1] == DOAP_NS: + return True + +def load_graph(doap, format="xml", get_list=False): + ''' + Load a DOAP profile into a RDFAlchemy/rdflib graph + + Supports any serialization format rdflib can parse (xml, n3, etc.) + + @param doap: DOAP + @type doap: string + + @param format: Serialization format we're parsing + @type format: string + + @param get_list: Return list of Projects if True + @type doap: list + + @rtype: Project + @returns: a Project{rdfSubject} + + ''' + rdfSubject.db = ConjunctiveGraph() + try: + rdfSubject.db.parse(StringIO(doap), format) + except SAXParseException: + sys.stderr.write("Error: Can't parse RDF/XML.\n") + raise Exception("Error: Can't parse RDF/XML.") + # sys.exit(2) + #If a serializer works on an entire graph, it doesn't matter which + #Project instance we give it. This is true for N3, XML/RDF etc. + #The 'text' serializer, on the other hand, prints out a separate + #description for each Project found in a graph. This is useful for + #'arbitrary' RDF, or FOAF where there may be several Projects listed. + #Ideally exactly one Project should be specified in an .rdf file. + #In the future load_graph will probably always return a list and let the + #plugins determine what to do when there are more than one Project + #found. + if get_list: + LOG.debug("doaplib: list of Projects") + try: + projs = list(Project.ClassInstances()) + LOG.debug("Found %s Projects." % len(projs)) + if len(projs) == 0: + sys.stderr.write('No DOAP found in that RDF.\n') + return projs + except StopIteration: + sys.stderr.write('No DOAP found in that RDF.\n') + sys.exit(2) + + else: + try: + LOG.debug("doaplib: single Project") + return Project.ClassInstances().next() + except StopIteration: + sys.stderr.write('No DOAP found in that RDF.\n') + sys.exit(2) + sys.stderr.write('No DOAP found in that RDF.\n') + +def get_by_pkg_index(index, project_name, proxy=None): + ''' + Get DOAP for a package index project name + + Builtin indexes: + + - 'sf' SourceForge + - 'fm' Freshmeat + - 'py' Python Package Index + + Note there can be other package indexes available by + third party plugins. 
+ + @param index: Package index two letter abbreviation + @type index: string + + @param project_name: project name + @type project_name: string + + @param proxy: Optional HTTP proxy URL + @type proxy: string + + @rtype: string + @return: text of file retrieved + + ''' + for plugin_obj in list(load_plugins()): + plugin = plugin_obj() + if hasattr(plugin, 'prefix'): + if plugin.prefix == index: + plugin.query = project_name + return plugin.search(proxy) + + +def query_by_homepage(url): + ''' + Get list of URL's for DOAP given a project's homepage. + The list can contain zero or multiple URLs. + + The return format is: + [(source, URL), (source, URL)...] + + 'source' is the two letter package index abbreviation or 'ex' for external. + 'external' meaning the DOAP was spidered on the web. + Possible package indexes: + + Current indexes: + + - 'sf' SourceForge + - 'fm' Freshmeat + - 'py' Python Package Index + - 'oh' Packages listed on Ohloh + + @param url: URL of homepage of a project + @type url: string + + @rtype: list + @return: A list of tuples containing URLs for DOAP found by homepage + + ''' + #Should check for env variable for alternate xmplrpc server for testing? + return XMLRPC_SERVER.query_by_homepage(url) + + +def print_doap(doap_xml, color=None, format='text', serializer=None, + filename=None): + ''' + Print DOAP as text, xml, or n3 etc. or to stdout or a file + A callable serializer object may be passed or a name of a serializer + plugin. + + @param doap_xml: DOAP profile in RDF/XML + @type doap_xml: string + + @param format: Serialization syntax formatter name + @type format: string + + @param serializer: Instance of a serializer + @type serializer: callable + + @param filename: Optional filename to write to + @type filename: string + + @return: `serializer` or 1 if invalid serialization request + + ''' + #If we were passed a callable serializer object use it, + #otherwise lookup serializer by name in list of plugins + if not serializer: + serializer = get_serializer(format) + if not serializer: + sys.stderr.write('Unknown serialization requested: %s\n' % format) + return 1 + + doap = serializer(doap_xml, color) + if filename: + try: + open(filename, 'w').write(doap.encode('utf-8')) + except UnicodeDecodeError: + open(filename, 'w').write(doap) + else: + print doap + + +def get_serializer(format): + ''' + Return a serializer instance given its name + + @param format: Name of serializer + @type format: string + + @rtype: function + @returns: Instance of a serializer + ''' + #Get all plugins with a `serialize` method + for plugin_obj in get_plugin('serialize'): + plugin = plugin_obj() + if plugin.name == format: + return plugin.serialize + + +def get_plugin(method): + """ + Return plugin object if `method` exists + + @param method: name of plugin's method we're calling + @type method: string + + @returns: list of plugins with `method` + + """ + all_plugins = [] + for plugin in load_plugins(): + #plugin().configure(None, None) + if not hasattr(plugin, method): + plugin = None + else: + all_plugins.append(plugin) + return all_plugins + + +def fetch_doap(url, proxy=None): + ''' + Fetch DOAP by its URL or filename + + @param url: URL of DOAP profile in RDF/XML serialization + @type url: string + + @rtype: text + @return: DOAP + ''' + return fetch_file(url, proxy) diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/lexers.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/lexers.py new file mode 100644 index 
0000000000000000000000000000000000000000..e86a9a7599d61535919e0ec816ec4e437d0fd4c6 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/lexers.py @@ -0,0 +1,148 @@ + +#pylint: disable-msg=C0301 + +""" + pygments.lexers.sw + ================== + + Lexers for semantic web languages. + + :copyright: 2007 by Philip Cooper . + :license: BSD, see LICENSE for more details. +""" + +import re + +from pygments.lexer import RegexLexer, include, bygroups +from pygments.token import Text, Comment, Operator, Keyword, Name, Literal + + +__all__ = ['Notation3Lexer', 'SparqlLexer'] + + +class Notation3Lexer(RegexLexer): + """ + Lexer for the N3 / Turtle / NT + """ + name = 'N3' + aliases = ['n3', 'turtle'] + filenames = ['*.n3', '*.ttl', '*.NT'] + mimetypes = ['text/rdf+n3', 'application/x-turtle', 'application/n3'] + + tokens = { + 'comments': [ + (r'(\s*#.*)', Comment) + ], + 'root': [ + include('comments'), + (r'(\s*@(?:prefix|base|keywords)\s*)(\w*:\s+)?(<[^> ]*>\s*\.\s*)', + bygroups(Keyword,Name.Variable,Name.Namespace)), + (r'\s*(<[^>]*\>)', Name.Class, ('triple','predObj')), + (r'(\s*[a-zA-Z_:][a-zA-Z0-9\-_:]*\s)', + Name.Class, ('triple','predObj')), + (r'\s*\[\]\s*', Name.Class, ('triple','predObj')), + ], + 'triple' : [ + (r'\s*\.\s*', Text, '#pop') + ], + 'predObj': [ + include('comments'), + (r'(\s*[a-zA-Z_:][a-zA-Z0-9\-_:]*\b\s*)', Operator, 'object'), + (r'\s*(<[^>]*\>)', Operator, 'object'), + (r'\s*\]\s*', Text, '#pop'), + (r'(?=\s*\.\s*)', Keyword, '#pop'), + ], + 'objList': [ + (r'\s*\)', Text, '#pop'), + include('object') + ], + 'object': [ + (r'\s*\[', Text, 'predObj'), + (r'\s*<[^> ]*>', Name.Attribute), + (r'\s*("""(?:.|\n)*?""")(\@[a-z]{2-4}|\^\^?)?\s*', + bygroups(Literal.String,Text)), + (r'\s*".*?[^\\]"(?:\@[a-z]{2-4}|\^\^?)?\s*', + Literal.String), + (r'\s*[a-zA-Z0-9\-_\:]\s*', Name.Attribute), + (r'\s*\(', Text, 'objList'), + (r'\s*;\s*\n?', Text, '#pop'), + (r'(?=\s*\])', Text, '#pop'), + (r'(?=\s*\.)', Text, '#pop'), + ], + } + + +class SparqlLexer(RegexLexer): + """ + Lexer for SPARQL Not Complete + """ + name = 'SPARQL' + aliases = ['sparql'] + filenames = ['*.sparql'] + mimetypes = ['text/x-sql'] + flags = re.IGNORECASE + tokens = { + 'comments': [ + (r'(\s*#.*)', Comment) + ], + 'root': [ + include('comments'), + (r'(\s*(?:PREFIX|BASE)\s+)(\w*:\w*)?(\s*<[^> ]*>\s*)', + bygroups(Keyword,Name.Variable,Name.Namespace)), + (r'(\s*#.*)', Comment), + (r'((?:SELECT|ASK|CONSTRUCT|DESCRIBE)\s*(?:DISTINCT|REDUCED)?\s*)((?:\?[a-zA-Z0-9_-]+\s*)+|\*)(\s*)', + bygroups(Keyword,Name.Variable,Text)), + (r'(FROM\s*(?:NAMED)?)(\s*.*)', bygroups(Keyword,Text)), + (r'(WHERE)?\s*({)', bygroups(Keyword,Text), 'graph'), + (r'(LIMIT|OFFSET)(\s*[+-]?[0-9]+)', + bygroups(Keyword,Literal.String)), + ], + 'graph':[ + (r'\s*(<[^>]*\>)', Name.Class, ('triple','predObj')), + (r'(\s*[a-zA-Z_0-9\-]*:[a-zA-Z0-9\-_]*\s)', + Name.Class, ('triple','predObj')), + (r'(\s*\?[a-zA-Z0-9_-]*)', Name.Variable, ('triple','predObj')), + (r'\s*\[\]\s*', Name.Class, ('triple','predObj')), + (r'\s*(FILTER\s*)((?:regex)?\()',bygroups(Keyword,Text),'filterExp'), + (r'\s*}', Text, '#pop'), + ], + 'triple' : [ + (r'(?=\s*})', Text, '#pop'), + (r'\s*\.\s*', Text, '#pop'), + ], + 'predObj': [ + include('comments'), + (r'(\s*\?[a-zA-Z0-9_-]*\b\s*)', Name.Variable,'object'), + (r'(\s*[a-zA-Z_:][a-zA-Z0-9\-_:]*\b\s*)', Operator, 'object'), + (r'\s*(<[^>]*\>)', Operator, 'object'), + (r'\s*\]\s*', Text, '#pop'), + (r'(?=\s*\.\s*)', Keyword, '#pop'), + ], + 'objList': [ + (r'\s*\)', 
Text, '#pop'), + include('object'), + ], + 'object': [ + include('variable'), + (r'\s*\[', Text, 'predObj'), + (r'\s*<[^> ]*>', Name.Attribute), + (r'\s*("""(?:.|\n)*?""")(\@[a-z]{2-4}|\^\^?)?\s*', bygroups(Literal.String,Text)), + (r'\s*".*?[^\\]"(?:\@[a-z]{2-4}|\^\^?)?\s*', Literal.String), + (r'\s*[a-zA-Z0-9\-_\:]\s*', Name.Attribute), + (r'\s*\(', Text, 'objList'), + (r'\s*;\s*', Text, '#pop'), + (r'(?=\])', Text, '#pop'), + (r'(?=\.)', Text, '#pop'), + ], + 'variable':[ + (r'(\?[a-zA-Z0-9\-_]+\s*)', Name.Variable), + ], + 'filterExp':[ + include('variable'), + include('object'), + (r'\s*[+*/<>=~!%&|-]+\s*', Operator), + (r'\s*\)', Text, '#pop'), + ], + + } + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/model.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9b935b51461221f7a942e56d407c20f75a390272 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/model.py @@ -0,0 +1,83 @@ + +''' + +Model of a DOAP profile using RDFAlchemy + +''' + +from rdfalchemy import rdfSubject, rdfSingle, rdfMultiple +from rdfalchemy.orm import mapper +from rdflib import Namespace + +DOAP = Namespace("http://usefulinc.com/ns/doap#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") +DC = Namespace("http://purl.org/dc/elements/1.1/") + + +class Project(rdfSubject): + + """ + DOAP Project Class + """ + + rdf_type = DOAP.Project + + category = rdfMultiple(DOAP.category) + created = rdfSingle(DOAP.created) + shortname = rdfSingle(DOAP.shortname) + description = rdfMultiple(DOAP.description) + bug_database = rdfSingle(DOAP['bug-database']) + developer = rdfMultiple(DOAP.developer, range_type=FOAF.Person) + documenter = rdfMultiple(DOAP.documenter, range_type=FOAF.Person) + download_mirror = rdfMultiple(DOAP['downoad-mirror']) + download_page = rdfSingle(DOAP['download-page']) + helper = rdfMultiple(DOAP.helper, range_type=FOAF.Person) + homepage = rdfSingle(DOAP.homepage) + license = rdfMultiple(DOAP['license']) + maintainer = rdfMultiple(DOAP.maintainer, range_type=FOAF.Person) + developer = rdfMultiple(DOAP.developer, range_type=FOAF.Person) + translator = rdfMultiple(DOAP.translator, range_type=FOAF.Person) + helper = rdfMultiple(DOAP.helper, range_type=FOAF.Person) + tester = rdfMultiple(DOAP.tester, range_type=FOAF.Person) + documenter = rdfMultiple(DOAP.documenter, range_type=FOAF.Person) + module = rdfSingle(DOAP.module) + name = rdfSingle(DOAP.name) + old_homepage = rdfMultiple(DOAP['old-homepage']) + programming_language = rdfMultiple(DOAP['programming-language']) + releases = rdfMultiple(DOAP.release, range_type=DOAP.Version) + svn_repository = rdfSingle(DOAP.repository, 'svn_repository', + range_type=DOAP.SVNRepository) + cvs_repository = rdfSingle(DOAP.repository, 'cvs_repository', + range_type=DOAP.CVSRepository) + oper_sys = rdfMultiple(DOAP['os']) + screenshots = rdfMultiple(DOAP.screenshots) + shortdesc = rdfMultiple(DOAP.shortdesc) + tester = rdfMultiple(DOAP.tester, range_type=FOAF.Person) + translator = rdfMultiple(DOAP.translator, range_type=FOAF.Person) + wiki = rdfMultiple(DOAP.wiki) + +class Release(rdfSubject): + """A release class""" + rdf_type = DOAP.Version + revision = rdfSingle(DOAP.revision) + name = rdfSingle(DOAP.name) + created = rdfSingle(DOAP.created) + changelog = rdfSingle(DC.description) + file_releases = rdfMultiple(DOAP['file-release']) + +class SVNRepository(rdfSubject): + 
"""Subversion repository classs""" + rdf_type = DOAP.SVNRepository + location = rdfSingle(DOAP.location) + svn_browse = rdfSingle(DOAP.browse) + +class CVSRepository(rdfSubject): + """CVS repository class""" + rdf_type = DOAP.CVSRepository + anon_root = rdfSingle(DOAP['anon-root']) + cvs_browse = rdfSingle(DOAP.browse) + module = rdfSingle(DOAP.module) + + +mapper(Project, Release, CVSRepository, SVNRepository) + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/__init__.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35ae07c88a703473dd933ed7b2f92b16774366b8 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/__init__.py @@ -0,0 +1,144 @@ + +# pylint: disable-msg=W0142,C0103 + + +""" +Writing Plugins +=============== + +doapfiend supports setuptools_ entry point plugins. + +There are two basic rules for plugins: + + - Plugin classes should subclass `doapfiend.plugins.Plugin`_. + - Plugins may implement any of the methods described in the class + PluginInterface in doapfiend.plugins.base. Please note that this class is for + documentary purposes only; plugins may not subclass PluginInterface. + +Setuptools: http://peak.telecommunity.com/DevCenter/setuptools +Doapfiend Plugins: http://trac.doapspace.org/doapfiend/wiki/DoapfiendPlugins + +Registering +----------- + +For doapfiend to find a plugin, it must be part of a package that uses +setuptools, and the plugin must be included in the entry points defined +in the setup.py for the package:: + + setup(name='Some plugin', + ... + entry_points = { + 'doapfiend.plugins': [ + 'someplugin = someplugin:SomePlugin' + ] + }, + ... + ) + +Once the package is installed with install or develop, doapfiend will be able +to load the plugin. + +Defining options +---------------- + +All plugins must implement the methods ``add_options(self, parser, env)`` +and ``configure(self, options, conf)``. Subclasses of doapfiend.plugins.Plugin +that want the standard options should call the superclass methods. + +doapfiend uses optparse.OptionParser from the standard library to parse +arguments. A plugin's ``add_options()`` method receives a parser +instance. It's good form for a plugin to use that instance only to add +additional arguments that take only long arguments (--like-this). Most +of doapfiend's built-in arguments get their default value from an environment +variable. This is a good practice because it allows options to be +utilized when run through some other means than the doapfiendtests script. + +A plugin's ``configure()`` method receives the parsed ``OptionParser`` options +object, as well as the current config object. Plugins should configure their +behavior based on the user-selected settings, and may raise exceptions +if the configured behavior is nonsensical. + +Logging +------- + +doapfiend uses the logging classes from the standard library. To enable users +to view debug messages easily, plugins should use ``logging.getLogger()`` to +acquire a logger in the ``doapfiend.plugins`` namespace. + +""" + +import logging +import pkg_resources +from warnings import warn +from inspect import isclass +from doapfiend.plugins.base import Plugin + +LOG = logging.getLogger('doapfiend') + +# +==== IMPORTANT ====+ +#If you add any builtin plugins in doapfiend.plugins you must add them +#to this list for them to be loaded. 
It's okay to add other Python modules +#in the doapfiend.plugins namespace, but they won't be recognized as a plugin +#unless listed here: + +builtin_plugins = ['url', 'homepage', 'n3', 'xml', 'text', 'sourceforge', + 'pypi', 'freshmeat', 'ohloh', 'fields'] + +def call_plugins(plugins, method, *arg, **kw): + """Call all method on plugins in list, that define it, with provided + arguments. The first response that is not None is returned. + """ + for plug in plugins: + func = getattr(plug, method, None) + if func is None: + continue + LOG.debug("call plugin %s: %s", plug.name, method) + result = func(*arg, **kw) + if result is not None: + return result + return None + +def load_plugins(builtin=True, others=True): + """Load plugins, either builtin, others, or both. + """ + loaded = [] + if builtin: + for name in builtin_plugins: + try: + parent = __import__(__name__, globals(), locals(), [name]) + #print name + pmod = getattr(parent, name) + for entry in dir(pmod): + obj = getattr(pmod, entry) + if (isclass(obj) + and issubclass(obj, Plugin) + and obj is not Plugin + and not obj in loaded): + #LOG.debug("load builtin plugin %s (%s)" % (name, obj)) + #print "load builtin plugin %s (%s)" % (name, obj) + yield obj + loaded.append(obj) + except KeyboardInterrupt: + raise + except Exception, e: + warn("Unable to load builtin plugin %s: %s" % (name, e), + RuntimeWarning) + for entry_point in pkg_resources.iter_entry_points('doapfiend.plugins'): + LOG.debug("load plugin %s" % entry_point) + try: + plugin = entry_point.load() + except KeyboardInterrupt: + raise + except Exception, err_msg: + # never want a plugin load to exit doapfiend + # but we can't log here because the logger is not yet + # configured + warn("Unable to load plugin %s: %s" % \ + (entry_point, err_msg), RuntimeWarning) + continue + if plugin.__module__.startswith('doapfiend.plugins'): + if builtin: + yield plugin + elif others: + yield plugin + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/base.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/base.py new file mode 100644 index 0000000000000000000000000000000000000000..3cd03d4a08557f80b6897580043f346d68e4db7a --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/base.py @@ -0,0 +1,78 @@ + +# pylint: disable-msg=W0201,W0511 +#XXX Attribute 'conf' defined outside __init__ + +""" + +Base plugin class +================= + +All plugins should inherit doapfiend.plugins.Plugin + +""" + + +import textwrap + +class Plugin(object): + """Base class for doapfiend plugins. It's not necessary to subclass this + class to create a plugin; however, all plugins must implement + `add_options(self, parser)` and `configure(self, options, + conf)`, and must have the attributes `enabled` and `name`. + + Plugins should not be enabled by default. + + Subclassing Plugin will give your plugin some friendly default + behavior: + + - A --with-$name option will be added to the command line + interface to enable the plugin. The plugin class's docstring + will be used as the help for this option. + - The plugin will not be enabled unless this option is selected by + the user. 
+ """ + enabled = False + enable_opt = None + name = None + + def __init__(self): + self.conf = None + if self.name is None: + self.name = self.__class__.__name__.lower() + if self.enable_opt is None: + self.enable_opt = "enable_plugin_%s" % self.name + + def add_options(self, parser): + """Add command-line options for this plugin. + + The base plugin class adds --with-$name by default, used to enable the + plugin. + """ + parser.add_option("--with-%s" % self.name, + action="store_true", + dest=self.enable_opt, + help="Enable plugin %s: %s" % + (self.__class__.__name__, self.help()) + ) + + def configure(self, options, conf): + """Configure the plugin and system, based on selected options. + + The base plugin class sets the plugin to enabled if the enable option + for the plugin (self.enable_opt) is true. + """ + self.conf = conf + self.options = options + if hasattr(options, self.enable_opt): + self.enabled = getattr(options, self.enable_opt) + + def help(self): + """Return help for this plugin. This will be output as the help + section of the --with-$name option that enables the plugin. + """ + if self.__class__.__doc__: + # doc sections are often indented; compress the spaces + return textwrap.dedent(self.__class__.__doc__) + return "(no help available)" + + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/fields.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/fields.py new file mode 100644 index 0000000000000000000000000000000000000000..d462437a14a4e43c753571a6f0783aa429bc001e --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/fields.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 +""" + +Plain text serializer +===================== + +This plugin outputs DOAP in human-readable plain text + +""" + +__docformat__ = 'epytext' + +import logging + +from rdflib import Namespace +from rdfalchemy import rdfSubject + +from doapfiend.plugins.base import Plugin +from doapfiend.utils import COLOR +from doapfiend.doaplib import load_graph + + +FOAF = Namespace("http://xmlns.com/foaf/0.1/") + +LOG = logging.getLogger('doapfiend') + + +class OutputPlugin(Plugin): + + """Class for formatting DOAP output""" + + #This will be the opt_parser option (--fields) + name = "fields" + enabled = False + enable_opt = name + + def __init__(self): + '''Setup Plain Text OutputPlugin class''' + super(OutputPlugin, self).__init__() + self.options = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + output.add_option('--%s' % self.name, + action='store', + dest=self.enable_opt, + help='Output specific DOAP fields as plain text') + return parser, output, search + + def serialize(self, doap_xml, color=False): + ''' + Serialize RDF/XML DOAP as N3 syntax + + @param doap_xml: DOAP in RDF/XML serialization + @type doap_xml: string + + @rtype: unicode + @return: DOAP in plain text + ''' + if hasattr(self.options, 'no_color'): + color = not self.options.no_color + if not color: + #This has already been done if we're called from cli.py + #Fix me: Need to think on this. 
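#The loop below overwrites every entry in COLOR with the ANSI reset code,
#so the serialized output renders without color.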
+ for this in COLOR: + COLOR[this] = '\x1b[0m' + + if hasattr(self.options, 'quiet'): + brief = self.options.quiet + else: + brief = False + + doap = load_graph(doap_xml) + fields = self.options.fields.split(',') + #print fields + out = '' + for field in fields: + if '-' in field: + field = field.replace('-', '_') + field = field.strip() + if '.' in field: + repo, field = field.split('.') + text = print_repos(doap, repo, field) + elif field == 'releases': + text = get_releases(doap, brief) + elif field in ['maintainer', 'developer', 'documenter', 'helper', + 'tester', 'translator']: + text = get_people(doap, field) + else: + try: + text = getattr(doap, field) + except AttributeError: + LOG.warn("No such attribute: %s" % field) + text = None + if not text: + continue + if isinstance(text, list): + text = print_list(doap, field) + else: + text = print_field(doap, field) + out += text + '\n' + return out.rstrip() + +def print_list(doap, field): + ''' + Print list of DOAP attributes + + @param doap: DOAP in RDF/XML + @type doap: text + + @param field: DOAP attribute to be printed + @type field: text + + @rtype: text + @returns: Field to be printed + ''' + #Can have multiple values per attribute + text = "" + for thing in getattr(doap, field): + if isinstance(thing, rdfSubject): + text += thing.resUri + else: + #unicode object + thing = thing.strip() + text += thing + return text + +def print_field(doap, field): + ''' + Print single field + + @param doap: DOAP in RDF/XML + @type doap: text + + @param field: DOAP attribute to be printed + @type field: text + + @rtype: text + @returns: Field to be printed + ''' + text = getattr(doap, field) + if isinstance(text, rdfSubject): + return text.resUri.strip() + else: + return text.strip() + +def print_repos(doap, repo, field): + '''Prints DOAP repository metadata''' + if repo == 'cvs': + if hasattr(doap.cvs_repository, field): + return getattr(doap.cvs_repository, field) + + if repo == 'svn': + if field == 'browse': + field = 'svn_browse' + if hasattr(doap.svn_repository, field): + text = getattr(doap.svn_repository, field) + if text: + if isinstance(text, rdfSubject): + return text.resUri + else: + return text.strip() + return '' + +def get_people(doap, job): + '''Print people for a particular job ''' + out = '' + if hasattr(doap, job): + attribs = getattr(doap, job) + if len(attribs) > 0: + peeps = [] + for attr in attribs: + if attr[FOAF.mbox] is None: + person = "%s" % attr[FOAF.name] + else: + mbox = attr[FOAF.mbox].resUri + if mbox.startswith('mailto:'): + mbox = mbox[7:] + person = "%s <%s>" % (attr[FOAF.name], mbox) + else: + LOG.debug("mbox is invalid: %s" % mbox) + person = "%s" % attr[FOAF.name] + peeps.append(person) + out += ", ".join([p for p in peeps]) + return out + + +def get_releases(doap, brief=False): + '''Print DOAP package release metadata''' + out = '' + if hasattr(doap, 'releases') and len(doap.releases) != 0: + if not brief: + out += COLOR['bold'] + "Releases:" + COLOR['normal'] + '\n' + for release in doap.releases: + if release.name: + out += COLOR['bold'] + COLOR['cyan'] + release.name + \ + COLOR['normal'] + '\n' + if hasattr(release, 'created') and release.created is not None: + created = release.created + else: + created = '' + out += COLOR['cyan'] + ' ' + release.revision + ' ' + \ + COLOR['normal'] + created + '\n' + if not brief: + if hasattr(release, 'changelog'): + if release.changelog: + out += COLOR['yellow'] + release.changelog + \ + COLOR['normal'] + '\n' + + for frel in release.file_releases: + out += ' %s' % 
frel.resUri + '\n' + return out + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/freshmeat.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/freshmeat.py new file mode 100644 index 0000000000000000000000000000000000000000..d79a768c9fda03ed83c22be8d4a778211bafac27 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/freshmeat.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +freshmeat +========= + +Currently this plugin uses http://doapspace.org/ to fetch DOAP for Freshmeat + + +""" + +__docformat__ = 'epytext' + + +from doapfiend.utils import NotFoundError +from doapfiend.plugins.base import Plugin +from doapfiend.plugins.pkg_index import get_by_pkg_index + + +class FreshmeatPlugin(Plugin): + + """Get DOAP from Freshmeat package index""" + + #This will be the opt_parser option (--fm) in the output group + name = "fm" + enabled = False + enable_opt = name + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(FreshmeatPlugin, self).__init__() + self.options = None + self.query = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + search.add_option('--%s' % self.name, + action='store', + dest=self.enable_opt, + help='Get DOAP by its Freshmeat project name.', + metavar='PROJECT_NAME') + return parser, output, search + + def search(self, proxy=None): + ''' + Get Freshmeat DOAP + + @param proxy: URL of optional HTTP proxy + @type proxy: string + + @rtype: unicode + @returns: Single DOAP + + ''' + if hasattr(self.options, self.name): + self.query = getattr(self.options, self.name) + #Else self.query was set directly, someone not using the CLI + try: + return get_by_pkg_index(self.name, self.query, proxy) + except NotFoundError: + print "Not found: %s" % self.query + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/homepage.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/homepage.py new file mode 100644 index 0000000000000000000000000000000000000000..91b710288b5be9b5e56017f6c99e020e34692495 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/homepage.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +homepage +======== + +Fetches DOAP by searching doapspace.org by a project's homepage. 
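An illustrative sketch of the same lookup done outside the CLI (the URL below
is a placeholder; it relies only on the query_by_homepage and fetch_doap
helpers imported in this module):

    hits = query_by_homepage('http://example.org/myproject')
    if hits:
        doap_xml = fetch_doap(hits[0][1])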
+ +""" + +__docformat__ = 'epytext' + +import logging + +from doapfiend.plugins.base import Plugin +from doapfiend.doaplib import fetch_doap, query_by_homepage + +LOG = logging.getLogger("doapfiend") + +class OutputPlugin(Plugin): + + """Class for formatting DOAP output""" + + #This will be the opt_parser option (--xml) in the output group + name = "homepage" + enabled = False + enable_opt = name + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(OutputPlugin, self).__init__() + self.options = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + search.add_option('-o', '--%s' % self.name, + action='store', + dest=self.enable_opt, + help="Search for DOAP by a project's homepage", + metavar='HOMEPAGE_URL') + return parser, output, search + + def search(self): + ''' + Get DOAP given a project's homepage + + @rtype: unicode + @return: DOAP + ''' + return do_search(self.options.homepage) + +def do_search(homepage): + ''' + Get DOAP given a project's homepage + + @param homepage: Project homepage URL + + @rtype: unicode + @return: DOAP + ''' + resp = query_by_homepage(homepage) + LOG.debug(resp) + if len(resp) == 0: + LOG.error("Not found: %s" % homepage) + return + elif len(resp) == 1: + url = resp[0][1] + else: + #Multiple, send warning and use first 'external' if any + LOG.warn("Warning: Multiple DOAP found.") + url = None + for this in resp: + LOG.warn(this) + if not url: + #Keep first one if there is no external DOAP + url = this[1] + if this[0] == 'ex': + url = this[1] + LOG.warn("Using %s" % url) + return fetch_doap(url) + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/n3.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/n3.py new file mode 100644 index 0000000000000000000000000000000000000000..88e25a2f129dbf6e1611b4ee4a80ec04bb6fb6fc --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/n3.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +Serializer for N3 (Notation 3) +============================== + +This is a plugin for formatting DOAP output as N3 (Notation 3) syntax. 
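The get_n3() helper defined below can also be called directly; a minimal,
illustrative use (doap_xml stands for any RDF/XML string):

    notation3 = get_n3(doap_xml, color=False)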
+ +""" + +__docformat__ = 'epytext' + +import logging +from cStringIO import StringIO + +from rdflib import ConjunctiveGraph + +from doapfiend.plugins.base import Plugin + +LOG = logging.getLogger(__name__) + + +def get_n3(xml_text, color=False): + ''' + Return N3 (Notation 3) text + Note: Returns string for non-color and unicode for colored text + + @param xml_text: XML/RDF + @type xml_text: string + + @rtype: unicode or string + @return: DOAP in Notation 3 + ''' + store = ConjunctiveGraph() + graph = store.parse(StringIO(xml_text), publicID=None, format="xml") + notation3 = graph.serialize(format="n3") + + if color: + #pygments plugin fools pylint + # pylint: disable-msg=E0611 + try: + from pygments import highlight + from doapfiend.lexers import Notation3Lexer + from pygments.formatters import TerminalFormatter + except ImportError: + return notation3 + return highlight(notation3, + Notation3Lexer(), + TerminalFormatter(full=False)) + else: + return notation3 + +class OutputPlugin(Plugin): + + """Class for formatting DOAP output""" + + #This will be the opt_parser option (--n3) + name = "n3" + enabled = False + enable_opt = None + + def __init__(self): + '''Setup N3 OutputPlugin class''' + super(OutputPlugin, self).__init__() + self.options = None + + def serialize(self, doap_xml, color=False): + ''' + Serialize RDF/XML DOAP as N3 syntax + + @param doap_xml: DOAP in RDF/XML serialization + @type doap_xml: string + + @rtype: unicode + @return: DOAP in Notation 3 + ''' + if hasattr(self, 'options') and hasattr(self.options, 'no_color'): + color = not self.options.no_color + return get_n3(doap_xml, color) + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + output.add_option('-n', '--%s' % self.name, + action='store_true', + dest=self.enable_opt, + help='Output DOAP as Notation 3') + return parser, output, search + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/ohloh.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/ohloh.py new file mode 100644 index 0000000000000000000000000000000000000000..64a25f9985ecfe73303acfd2cc14863073355223 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/ohloh.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +ohloh +===== + +This plugin uses http://rdfohloh.wikier.org/ to fetch DOAP for +projects listed on Ohlohh. 
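Under the hood this plugin calls doapfiend.plugins.pkg_index.get_by_pkg_index
with the 'oh' index; an illustrative direct call (the project name is a
placeholder):

    from doapfiend.plugins.pkg_index import get_by_pkg_index
    doap_xml = get_by_pkg_index('oh', 'some-project')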
+ + +""" + +__docformat__ = 'epytext' + + +from doapfiend.utils import NotFoundError +from doapfiend.plugins.base import Plugin +from doapfiend.plugins.pkg_index import get_by_pkg_index + + +class OhlohPlugin(Plugin): + + """Get DOAP from Ohloh package index""" + + #This will be the opt_parser option (--oh) in the output group + name = "oh" + enabled = False + enable_opt = name + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(OhlohPlugin, self).__init__() + self.options = None + self.query = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + search.add_option('--%s' % self.name, + action='store', + dest=self.enable_opt, + help='Get DOAP by its Ohloh project name or id #.', + metavar='PROJECT_NAME') + return parser, output, search + + def search(self, proxy=None): + ''' + Get Ohloh DOAP + + @param proxy: Option HTTP proxy URL + @type proxy: string + + @rtype: unicode + @returns: Single DOAP + + ''' + if hasattr(self.options, self.name): + self.query = getattr(self.options, self.name) + #Else self.query was set directly, someone not using the CLI + try: + return get_by_pkg_index(self.name, self.query, proxy) + except NotFoundError: + print "Not found: %s" % self.query + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/pkg_index.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/pkg_index.py new file mode 100644 index 0000000000000000000000000000000000000000..49050e38b07a84b1f9dc2181ff43517ebb7947a8 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/pkg_index.py @@ -0,0 +1,46 @@ + +''' + +Plugin helper to fetch a single DOAP file from doapspace.org +by Package Index + +''' + +from doapfiend.utils import fetch_file + +PKG_INDEX_URI = 'http://doapspace.org/doap' +OHLOH_URI = 'http://rdfohloh.wikier.org/project/' + + +def get_by_pkg_index(index, project_name, proxy=None): + ''' + Get DOAP for a package index project name from doapspace.org + + Builtin indexes: + + - 'sf' SourceForge + - 'fm' Freshmeat + - 'py' Python Package Index + - 'oh' Project listed on Ohlo + + Raises doaplib.utils.NotFound exception on HTTP 404 error + + @param index: Package index two letter abbreviation + @type index: string + + @param project_name: project name + @type project_name: string + + @param proxy: Optional HTTP proxy URL + @type proxy: string + + @rtype: string + @return: text of file retrieved + + ''' + if index == 'oh': + url = '%s/%s/rdf' % (OHLOH_URI, project_name) + else: + url = '%s/%s/%s' % (PKG_INDEX_URI, index, project_name) + return fetch_file(url, proxy) + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/pypi.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/pypi.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0f4b17c5f1722cb5e29a9eaf0dbd7856b6dd83 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/pypi.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" +pypi +==== + +Currently this plugin uses http://doapspace.org/ to fetch DOAP for PyPI +(The Python Package Index) + +""" + +__docformat__ = 'epytext' + + +from doapfiend.utils import NotFoundError +from doapfiend.plugins.base import Plugin +from doapfiend.plugins.pkg_index import get_by_pkg_index + + +class 
PyPIPlugin(Plugin): + + """Get DOAP from PyPI package index""" + + #This will be the opt_parser option (--py) in the output group + name = 'py' + enabled = False + enable_opt = name + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(PyPIPlugin, self).__init__() + self.options = None + self.query = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + search.add_option('--%s' % self.name, + action='store', + dest=self.enable_opt, + help='Get DOAP by its PyPI project name.', + metavar='PROJECT_NAME') + return parser, output, search + + def search(self, proxy=None): + ''' + Get PyPI DOAP + + @param proxy: URL of optional HTTP proxy + @type proxy: string + + @rtype: unicode + @returns: Single DOAP + + ''' + if hasattr(self.options, self.name): + self.query = getattr(self.options, self.name) + #Else self.query was set directly, someone not using the CLI + try: + return get_by_pkg_index(self.name, self.query, proxy) + except NotFoundError: + print "Not found: %s" % self.query + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/sourceforge.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/sourceforge.py new file mode 100644 index 0000000000000000000000000000000000000000..09b063287126af1fa3ae8d43d0fec80ab04b6937 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/sourceforge.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +sourceforge +=========== + +Currently this plugin uses http://doapspace.org/ to fetch DOAP for SourceForge + + +""" + +__docformat__ = 'epytext' + + +from doapfiend.utils import NotFoundError +from doapfiend.plugins.base import Plugin +from doapfiend.plugins.pkg_index import get_by_pkg_index + + +class SourceForgePlugin(Plugin): + + """Get DOAP from SourceForge package index""" + + #This will be the opt_parser option (--sf) in the output group + name = "sf" + enabled = False + enable_opt = name + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(SourceForgePlugin, self).__init__() + self.options = None + self.query = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + search.add_option('--%s' % self.name, + action='store', + dest=self.enable_opt, + help='Get DOAP by its SourceForge project name.', + metavar='PROJECT_NAME') + return parser, output, search + + def search(self, proxy=None): + ''' + Get SourceForge DOAP + + @param proxy: Option HTTP proxy URL + @type proxy: string + + @rtype: unicode + @returns: Single DOAP + + ''' + if hasattr(self.options, self.name): + self.query = getattr(self.options, self.name) + #Else self.query was set directly, someone not using the CLI + try: + return get_by_pkg_index(self.name, self.query, proxy) + except NotFoundError: + print "Not found: %s" % self.query + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/text.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/text.py new file mode 100644 index 0000000000000000000000000000000000000000..83e6b48cdec45360fd97aaf99319e549514b41c3 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/text.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 +""" + +Plain text serializer 
+===================== + +This plugin outputs DOAP in human-readable plain text + +""" + +__docformat__ = 'epytext' + +import logging +import textwrap +from cStringIO import StringIO + +from rdflib import Namespace +from rdfalchemy import rdfSubject + +from doapfiend.plugins.base import Plugin +from doapfiend.utils import COLOR +from doapfiend.doaplib import load_graph + + +FOAF = Namespace("http://xmlns.com/foaf/0.1/") + +LOG = logging.getLogger(__name__) + + +class OutputPlugin(Plugin): + + """Class for formatting DOAP output""" + + #This will be the opt_parser option (--text) + name = "text" + enabled = False + enable_opt = None + + def __init__(self): + '''Setup Plain Text OutputPlugin class''' + super(OutputPlugin, self).__init__() + self.options = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + output.add_option('--%s' % self.name, + action='store_true', + dest=self.enable_opt, + help='Output DOAP as plain text (Default)') + return parser, output, search + + def serialize(self, doap_xml, color=False): + ''' + Serialize RDF/XML DOAP as plain text + + @param doap_xml: DOAP in RDF/XML serialization + @type doap_xml: string + + @rtype: unicode + @return: DOAP in plain text + ''' + if hasattr(self.options, 'no_color'): + color = not self.options.no_color + if not color: + #This has already been done if we're called from cli.py + #Fix me: Need to think on this. + for this in COLOR: + COLOR[this] = '\x1b[0m' + if hasattr(self.options, 'quiet'): + brief = self.options.quiet + else: + brief = False + + printer = DoapPrinter(load_graph(doap_xml, get_list=True), brief, color) + return printer.print_doap() + + +class DoapPrinter(object): + + '''Prints DOAP in human readable text''' + + def __init__(self, doap, brief=False, color=False): + '''Initialize attributes''' + self.brief = brief + self.doap_list = doap + self.doap = None + self.text = StringIO() + self.color = color + + def write(self, text): + ''' + Write to DOAP output file object + ''' + self.text.write(text.encode('utf-8') + '\n') + + def print_doap(self): + ''' + Serialize DOAP in human readable text, optionally colorized + + @rtype: unicode + @return: DOAP as plain text + ''' + for doap in self.doap_list: + self.doap = doap + self.print_misc() + if self.brief: + return + self.print_people() + self.print_repos() + self.print_releases() + doap = self.text.getvalue() + self.text.close() + return doap + + def print_misc(self): + '''Prints basic DOAP metadata''' + #We should be able to get this from model.py automatically, + #but this lets us print in the order we like. + #Maybe move this to that model.py so we don't forget to sync + #when the DOAP schema changes. 
+ fields = ('name', 'shortname', 'homepage', 'shortdesc', + 'description', 'old_homepage', 'created', + 'download_mirror') + + fields_verbose = ('license', 'programming_language', + 'bug_database', 'screenshots', 'oper_sys', + 'wiki', 'download_page', 'mailing_list') + + for fld in fields: + self.print_field(fld) + if not self.brief: + for fld in fields_verbose: + self.print_field(fld) + + def print_repos(self): + '''Prints DOAP repository metadata''' + if hasattr(self.doap.cvs_repository, 'module') and \ + self.doap.cvs_repository.module is not None: + self.write(misc_field('CVS Module:', + self.doap.cvs_repository.module)) + self.write(misc_field('CVS Anon:', + self.doap.cvs_repository.anon_root)) + self.write(misc_field('CVS Browse:', + self.doap.cvs_repository.cvs_browse.resUri)) + + if hasattr(self.doap.svn_repository, 'location') and \ + self.doap.svn_repository.location is not None: + self.write(misc_field('SVN Location:', + self.doap.svn_repository.location.resUri)) + + if hasattr(self.doap.svn_repository, 'svn_browse') and \ + self.doap.svn_repository.svn_browse is not None: + self.write(misc_field('SVN Browse:', + self.doap.svn_repository.svn_browse.resUri)) + + def print_releases(self): + '''Print DOAP package release metadata''' + if hasattr(self.doap, 'releases') and len(self.doap.releases) != 0: + self.write(COLOR['bold'] + "Releases:" + COLOR['normal']) + for release in self.doap.releases: + if release.name: + self.write(COLOR['bold'] + COLOR['cyan'] + release.name + \ + COLOR['normal']) + if hasattr(release, 'created') and release.created is not None: + created = release.created + else: + created = '' + self.write(COLOR['cyan'] + ' ' + release.revision + ' ' + \ + COLOR['normal'] + created) + if hasattr(release, 'changelog'): + if release.changelog: + self.write(COLOR['yellow'] + \ + release.changelog + + COLOR['normal'] + ) + for frel in release.file_releases: + self.write(' %s' % frel.resUri) + + def print_people(self): + '''Print all people involved in the project''' + people = ['maintainer', 'developer', 'documenter', 'helper', + 'tester', 'translator'] + for job in people: + if hasattr(self.doap, job): + attribs = getattr(self.doap, job) + if len(attribs) > 0: + peeps = [] + for attr in attribs: + if attr[FOAF.mbox] is None: + person = "%s" % attr[FOAF.name] + else: + mbox = attr[FOAF.mbox].resUri + if mbox.startswith('mailto:'): + mbox = mbox[7:] + person = "%s <%s>" % (attr[FOAF.name], mbox) + else: + LOG.debug("mbox is invalid: %s" % mbox) + person = "%s" % attr[FOAF.name] + peeps.append(person) + label = job.capitalize() + "s:" + #label = label.ljust(13) + self.write(misc_field(label, + ", ".join([p for p in peeps]))) + + def print_field(self, name): + ''' + Print a DOAP element + + @param name: A misc DOAP element + @type name: string, list or RDFSubject + + @rtype: None + @return: Nothing + ''' + if not hasattr(self.doap, name): + return + attr = getattr(self.doap, name) + if attr is [] or attr is None: + return + + label = '%s' % COLOR['bold'] + pretty_name(name) + \ + COLOR['normal'] + ':' + label = label.ljust(21) + if isinstance(attr, list): + #Can have multiple values per attribute + text = "" + for thing in getattr(self.doap, name): + if isinstance(thing, rdfSubject): + text += thing.resUri + "\n" + else: + #unicode object + thing = thing.strip() + text += thing + "\n" + else: + text = getattr(self.doap, name) + if isinstance(text, rdfSubject): + text = text.resUri + else: + text = text.strip() + if text: + if text.startswith('http://'): + self.write('%s 
%s' % (label, text.strip())) + else: + self.write(textwrap.fill('%s %s' % (label, text), + initial_indent='', + subsequent_indent = ' ')) + + +def pretty_name(field): + """ + Convert DOAP element name to pretty printable label + Shorten some labels for formatting purposes + + @param field: Text to be formatted + @type field: C{string} + + @return: formatted string + @rtype: string + """ + if field == 'programming_language': + field = 'Prog. Lang.' + elif field == 'created': + field = 'DOAP Created' + else: + field = field.capitalize() + field = field.replace('_', ' ') + field = field.replace('-', ' ') + return field + + +def misc_field(label, text): + ''' + Print colorized and justified single label value pair + + @param label: A label + @type label: string + + @param text: Text to print + @type text: string + + @rtype: string + @return: Colorized, left-justified text with label + ''' + label = label.ljust(13) + label = COLOR['bold'] + label + COLOR['normal'] + return '%s %s' % (label, text) + + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/url.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/url.py new file mode 100644 index 0000000000000000000000000000000000000000..43b55b919828c0be955a3c9d4345e4e00596d10b --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/url.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +url.py +====== + +This plugin loads DOAP by its URL or path to a filename. + + +""" + +__docformat__ = 'epytext' + + +from doapfiend.plugins.base import Plugin +from doapfiend.utils import NotFoundError +from doapfiend.doaplib import fetch_doap + + +class UrlPlugin(Plugin): + + """Class for formatting DOAP output""" + + #This will be the opt_parser option (--url) in the 'search' group + name = 'url' + enabled = False + enable_opt = name + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(UrlPlugin, self).__init__() + self.options = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + search.add_option('-u', '--%s' % self.name, + action='store', + dest=self.enable_opt, + help='Get DOAP by its URL or by filename.', + metavar='URL') + return parser, output, search + + def search(self): + ''' + Get DOAP by its URL or file path + This can be any RDF as long as it has the DOAP namespace. + + @rtype: unicode + @return: DOAP + ''' + try: + return fetch_doap(self.options.url, self.options.proxy) + except NotFoundError: + print "Not found: %s" % self.options.url + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/xml.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/xml.py new file mode 100644 index 0000000000000000000000000000000000000000..4d084fb5c930df2c2b9592043439cbc9da260d84 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/plugins/xml.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +# pylint: disable-msg=W0221,R0201 + +""" + +Serialize DOAP as XML/RDF +========================= + +This plugin outputs DOAP in RDF/XML +It basically does nothing because all DOAP today is in RDF/XML. +In the future this may take N3, Turtle, RDFa etc. and convert it to RDF/XML. 
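The serialize() method below mostly validates its input (ElementTree.fromstring
raises on malformed XML) and optionally colorizes it; an illustrative call,
with doap_xml standing for any RDF/XML string:

    rdf_xml = OutputPlugin().serialize(doap_xml, color=False)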
+ +""" + +__docformat__ = 'epytext' + +from elementtree import ElementTree + +from doapfiend.plugins.base import Plugin + + +class OutputPlugin(Plugin): + + """Class for formatting DOAP output""" + + #This will be the opt_parser option (--xml) in the output group + name = "xml" + enabled = False + enable_opt = None + + def __init__(self): + '''Setup RDF/XML OutputPlugin class''' + super(OutputPlugin, self).__init__() + self.options = None + + def add_options(self, parser, output, search): + """Add plugin's options to doapfiend's opt parser""" + output.add_option('-x', '--%s' % self.name, + action='store_true', + dest=self.enable_opt, + help='Output DOAP as RDF/XML') + return parser, output, search + + def serialize(self, doap_xml, color=False): + ''' + Serialize RDF/XML DOAP as N3 syntax + + Since the only input we currently have is XML, all this really does + is parse the XML and raise an exception if it's invalid. + When we do content negotiation/accept N3 etc., this will serialize. + + @param doap_xml: DOAP in RDF/XML serialization + @type doap_xml: string + + @rtype: unicode + @returns: DOAP + ''' + #This will raise ExpatError if we have invalid XML + #(from xml.parsers.expat import ExpatError) + #We could format/pretty print here but we just return exactly what + #was fetched. + ElementTree.fromstring(doap_xml) + if hasattr(self.options, 'no_color'): + color = not self.options.no_color + if color: + #pygments plugin fools pylint + # pylint: disable-msg=E0611 + try: + from pygments import highlight + from pygments.lexers import XmlLexer + from pygments.formatters import TerminalFormatter + except ImportError: + return doap_xml + return highlight(doap_xml, + XmlLexer(), + TerminalFormatter(full=False)) + else: + return doap_xml + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/utils.py b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b727ae1a95fa533cd66910bbbea3d795b3c8cd52 --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/build/lib.linux-x86_64-2.7/doapfiend/utils.py @@ -0,0 +1,216 @@ + +""" + +utils.py +======== + +Misc utilities for doapfiend +---------------------------- + +General purpose helper functions and classes for doapfiend +You'll probably want to use doaplib for most cases. + +License: BSD-2 + +""" + +#pylint: disable-msg=C0103 + +import urllib +import logging +import urlparse +from httplib import HTTPConnection +from urllib2 import build_opener, HTTPError, ProxyHandler, URLError + + +__docformat__ = 'epytext' + +LOG = logging.getLogger('doapfiend') + +COLOR = {'normal': "\033[0m", + 'bold': "\033[1m", + 'underline': "\033[4m", + 'blink': "\033[5m", + 'reverse': "\033[7m", + 'black': "\033[30m", + 'red': "\033[31m", + 'green': "\033[32m", + 'yellow': "\033[33m", + 'blue': "\033[34m", + 'magenta': "\033[35m", + 'cyan': "\033[36m", + 'white': "\033[37m"} + + +class NotFoundError(Exception): + + '''DOAP not found''' + + #pylint: disable-msg=W0231 + def __init__(self, err_msg): + '''Initialize attributes''' + self.err_msg = err_msg + + def __str__(self): + return repr(self.err_msg) + + +def http_filesize(url): + """ + Get the size of file without downloading it. 
+ bla bla bla + blaba + + @param url: URL of file + @type url: string + + @rtype: string + @return: Size of file + + Usage: + + >>> http_filesize('http://trac.doapspace.org/test_file.txt') + '160' + """ + + host, path = urlparse.urlsplit(url)[1:3] + if ':' in host: + # port specified, try to use it + host, port = host.split(':', 1) + try: + port = int(port) + except ValueError: + LOG.error('invalid port number %r' % port) + return False + else: + # no port specified, use default port + port = None + connection = HTTPConnection(host, port=port) + connection.request("HEAD", path) + resp = connection.getresponse() + return resp.getheader('content-length') + + +def http_exists(url): + """ + A quick way to check if a file exists on the web. + + @param url: URL of the document + @type url: string + @rtype: boolean + @return: True or False + + Usage: + + >>> http_exists('http://www.python.org/') + True + >>> http_exists('http://www.python.org/PenguinOnTheTelly') + False + """ + + host, path = urlparse.urlsplit(url)[1:3] + if ':' in host: + #port specified, try to use it + host, port = host.split(':', 1) + try: + port = int(port) + except ValueError: + LOG.error('invalid port number %r' % port) + return False + else: + #no port specified, use default port + port = None + connection = HTTPConnection(host, port=port) + connection.request("HEAD", path) + resp = connection.getresponse() + if resp.status == 200: # normal 'found' status + found = True + elif resp.status == 302: # recurse on temporary redirect + found = http_exists(urlparse.urljoin(url, + resp.getheader('location', ''))) + else: # everything else -> not found + LOG.info("Status %d %s : %s" % (resp.status, resp.reason, url)) + found = False + return found + + +def is_content_type(url_or_file, content_type): + """ + Tells whether the URL or pseudofile from urllib.urlopen is of + the required content type. 
+ + @param url_or_file: URL or file path + @type url_or_file: string + @param content_type: Content type we're looking for + @type content_type: string + + @rtype: boolean + @returns: True if it can return the Content type we want + + Usage: + + >>> is_content_type('http://doapspace.org/doap/sf/nlyrics.rdf', \ + 'application/rdf+xml') + True + >>> is_content_type('http://doapspace.org/', 'application/rdf+xml') + False + """ + try: + if isinstance(url_or_file, str): + thefile = urllib.urlopen(url_or_file) + else: + thefile = url_or_file + result = thefile.info().gettype() == content_type.lower() + if thefile is not url_or_file: + thefile.close() + except IOError: + result = False + return result + + +def fetch_file(url, proxy=None): + ''' + Download file by URL + + @param url: URL of a file + @type url: string + + @param proxy: URL of HTTP Proxy + @type proxy: string + + @return: File + @rtype: string + + ''' + if not url.startswith('http://') and not url.startswith('ftp://'): + try: + return open(url, 'r').read() + except IOError, errmsg: + LOG.error(errmsg) + return '' + LOG.debug('Fetching ' + url) + if proxy: + opener = build_opener(ProxyHandler({'http': proxy})) + else: + opener = build_opener() + opener.addheaders = [('Accept', 'application/rdf+xml'), + ('User-agent', + 'Mozilla/5.0 (compatible; doapfiend ' + + 'http://trac.doapspace.org/doapfiend)')] + try: + result = opener.open(url) + except HTTPError, err_msg: + if err_msg.code == 404: + raise NotFoundError('Not found: %s' % url) + else: + LOG.error(err_msg) + except URLError, err_msg: + LOG.error(err_msg) + return + return result.read() + + +if __name__ == '__main__': + import doctest + doctest.testmod() + diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/dist/doapfiend-0.3.3-py2.7.egg b/web-crawler/lib/doapfiend/doapfiend-0.3.3/dist/doapfiend-0.3.3-py2.7.egg new file mode 100644 index 0000000000000000000000000000000000000000..e6532291024c3befc60d3ffacae276c5bc8785c4 Binary files /dev/null and b/web-crawler/lib/doapfiend/doapfiend-0.3.3/dist/doapfiend-0.3.3-py2.7.egg differ diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/doapfiend/.gitignore b/web-crawler/lib/doapfiend/doapfiend-0.3.3/doapfiend/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0479089c7f1d10e0a5b276b6db2b02727a040f8d --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/doapfiend/.gitignore @@ -0,0 +1 @@ +/__init__.pyc diff --git a/web-crawler/lib/doapfiend/doapfiend-0.3.3/tests/.gitignore b/web-crawler/lib/doapfiend/doapfiend-0.3.3/tests/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ebf21bc02f58bfc3dec82b3338e37118937d527d --- /dev/null +++ b/web-crawler/lib/doapfiend/doapfiend-0.3.3/tests/.gitignore @@ -0,0 +1,5 @@ +/test_cli.pyc +/test_doaplib.pyc +/test_n3.pyc +/test_utils.pyc +/test_xml.pyc diff --git a/web-crawler/r_forge_data_fetcher.py b/web-crawler/r_forge_data_fetcher.py index 122ece45ccbd124bf3f0f189294c5ea4792f40b1..d6433dae4c2f931ab25ac3f1c40836c816eb679c 100755 --- a/web-crawler/r_forge_data_fetcher.py +++ b/web-crawler/r_forge_data_fetcher.py @@ -2,24 +2,45 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # -# Copyright 2014 Bitergium SLL -# Author: Maria Di Girolamo +#Copyright 2014 Bitergium SLL + +#New Data Fetcher configured for MORPHEMIC Project release 2.0 +# Ownership : Engineering Ingegneria Informatica S.p.A. 
+# Author: Maria Antonietta Di Girolamo +# Year: 2020/2021 +#Maintainer:Maria Antonietta Di Girolamo from doapfiend.doaplib import load_graph from xml.dom.minidom import parse, parseString from Utils import Logger, Sources from pip import req +from bs4 import BeautifulSoup +from lxml.etree import tostring +from lxml import etree as et +from lxml import html, etree -import urllib2 +import urllib.request, urllib.error, urllib.parse import base64 +import ast +import re +import requests +# create a timestamp using the time() method +start_time = time.time() +# declare globals for the Elasticsearch client host +DOMAIN = "localhost" +PORT = 9200 +INDEX = "knowledgebase" +# concatenate a string for the client's host paramater +host = str(DOMAIN) + ":" + str(PORT) +# declare an instance of the Elasticsearch library +client = Elasticsearch(host) +#es_client used to create the knowldgebase dopp-model +es_client = Elasticsearch([{'host':str(DOMAIN), 'port':str(PORT)}]) -from bs4 import BeautifulSoup -from lxml.etree import tostring -from lxml import html class R_Forge_DataFetcher(): -#I progetti sono catalogati per : +#The projects are listed for : #

#Browse By:
#  - Development Status
#  - Environment
@@ -30,10 +51,6 @@ class R_Forge_DataFetcher():
#  - Operating System
#  - Programming Language
#  - Topic
  • form_cat=55 per Topic -#Creo un array con tutti i cataloghi della lista. - -#As repoterd in the link: -#Exports Available #R-Forge data is exported in a variety of standard formats. Many of the export URLs can also accept form/get data to customize the output. All data generated by these pages is realtime. #RSS/XML Exports #News Data @@ -45,62 +62,74 @@ class R_Forge_DataFetcher(): #R-Forge Full Project Listing (RSS 0.91, ) #R-Forge Full Project Listing (RSS 2.0) #R-Forge Trove Categories Tree (XML,) -#noi useremo R-Forge Trove Categories Tree +#WE used R-Forge Trove Categories Tree def __init__(self): - print "Starting R_Forge....." - #response = urllib2.urlopen('https://r-forge.r-project.org/export/trove_tree.php') + #print("Starting R_Forge.....") + response = urllib.request.urlopen('https://r-forge.r-project.org/export/trove_tree.php') - #print "after reading list of doap files " - #doc = response.read() + #print "after reading list of doap files " + doc = response.read() + ''' url_link = 'https://r-forge.r-project.org/softwaremap/trove_list.php?cat=c&form_cat=0' - print url_link r_html = urllib2.urlopen(url_link) - #print "after reading list of doap files " self.tree = html.fromstring(r_html.read()) r_category = self.tree.xpath('//li[@class="current-cat"]/text()') - r_project = self.tree.xpath('//ul/li/a/text()') - r_em = self.tree.xpath('//li/em/text()') - print r_category - print r_project - print r_em - + r_project = self.tree.xpath('//ul/li/a/text()') + r_subproject_forcategory = self.tree.xpath('//tr[@class="top"]/td[@id="project-tree-col1"]/ul/li/a') + #print("Project for Category ",r_category, "is :") + i = 0 + for i in r_category: + print("Project for Category ",i, "is :") + for child in r_subproject_forcategory: + cat_link = child.attrib['href'] + print(child.text, " with link :", "https://r-forge.r-project.org/softwaremap/" + cat_link) + ''' + - #print doc - -""" - soup = BeautifulSoup(doc,"lxml") - #print(soup.prettify()) - #content = soup.find(id="0") - #print content + cat = [] id_parent = [] - print "id_child[ " + #print "id_child[ " + id_child = [] + ''' for c in soup.find_all('category'): + k = c.get('id') id_parent.append(c.get('id')) - id_child = [] - for child in id_parent: - id_child.append(child) - print id_child - print " ] " - print "id parent " - print id_parent - print "id child " - print id_child - """ - # for l in id: + id_child.append(c.get('name')) + #print c.get('id') + ''' + url_link = 'https://r-forge.r-project.org/softwaremap/trove_list.php?cat=c&form_cat=0' #+ k + #print k + r_html = urllib.request.urlopen(url_link) + #print "after reading list of doap files " + self.tree = html.fromstring(r_html.read()) + r_subproject_forcategory = self.tree.xpath('//tr[@class="top"]/td[@id="project-tree-col1"]/ul/li/a/text()') + print(r_subproject_forcategory) + r_forcategory = self.tree.xpath('//tr[@class="top"]/td[@id="project-tree-col1"]/ul/li/a') + for child in r_forcategory: + cat_link = child.attrib['href'] + print cat_link + print(( 'Child of Topics' ,child.text, " with link :", "https://r-forge.r-project.org/softwaremap/" + cat_link)) + + #id_child.append(c.get('id')) + #print cat + #print id_parent + #print id_child + #for l in id_child: #print l #for k in l: - #td id="project-tree-col1" - #class=project-tree-branches - # url_link = 'https://r-forge.r-project.org/softwaremap/trove_list.php?cat=c&form_cat=' + k - # print url_link - # r_html = urllib2.urlopen(url_link) - #print "after reading list of doap files " - # self.tree = 
html.fromstring(r_html.read()) - # r_category = self.tree.xpath('//p/strong/text()') - # print r_category - # r_project = self.tree.xpath('//li[@class="current-cat"]/text()') - # print r_project - - + ''' + for k in c.get('id'): + id="project-tree-col1" + url_link = 'https://r-forge.r-project.org/softwaremap/trove_list.php?cat=c&form_cat=' + k + #print k + r_html = urllib2.urlopen(url_link) + #print "after reading list of doap files " + self.tree = html.fromstring(r_html.read()) + r_subproject_forcategory = self.tree.xpath('//tr[@class="top"]/td[@id="project-tree-col1"]/ul/li/a') + #for child in r_subproject_forcategory: + #print child.text + #cat_link = child.attrib['href'] + #print('k ', k ,' Child ' ,child.text, " with link :", "https://r-forge.r-project.org/softwaremap/" + cat_link) + ''' R_Forge_DataFetcher() \ No newline at end of file diff --git a/web-crawler/startCrawler b/web-crawler/startCrawler index c4d38d4825f758aff2ea385eb355210bb8ec062b..dfcd3dbc88fa3e100bba5fbdb12c9b7f7def7e07 100755 --- a/web-crawler/startCrawler +++ b/web-crawler/startCrawler @@ -5,9 +5,5 @@ # Copyright 2014 Bitergium SLL -python Orchestrator.py & -#python RepositoryCrawlerThread.py & -python NotifierThread.py & +python3 Orchestrator.py echo Starting Crawler: find log path and other configuration in ./config -#cd api -#pserve api.ini --reload \ No newline at end of file diff --git a/web-crawler/stopCrawler b/web-crawler/stopCrawler index 1fdb6c4187edc448b43d006286ad781c0b133187..af81f56e573071efc75f9dfa59916fc1842a3b6b 100755 --- a/web-crawler/stopCrawler +++ b/web-crawler/stopCrawler @@ -1,6 +1,5 @@ #/bin/bash -for proc in "pserve api.ini --reload" "python Orchestrator.py" "python NotifierThread.py"; do -#for proc in "pserve api.ini --reload" "python Orchestrator.py"; do +for proc in "python3 Orchestrator.py"; do ppid=$(pgrep -f "$proc") if [[ "$ppid" -ne "" ]]; then echo "killing process $proc with pid $ppid"