Commit 7ac51b5c authored by Marta Różańska

Merge branch 'doap-model-metadata' into 'morphemic-rc1.5'

Porting of the Web-Crawler from Python 2.7 to Python 3.9.6 and configuration of Apache and jQuery as SoftwareRepositories

See merge request !146
parents 35783879 217f929b
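As context for the hunks that follow, this is a hedged, stand-alone sketch of the recurring Python 2 to Python 3 patterns in this merge request; the function and values are illustrative, not the project's API.

```python
# Illustrative sketch of the Python 2 -> 3 patterns applied in this merge
# request; names and values here are examples, not the project's code.
import configparser      # Python 2: ConfigParser
import urllib.request    # Python 2: urllib2

def encode_parameters(parameters):
    # Python 2 iterated dict.keys() directly; Python 3 needs an explicit
    # list() copy when the dict is mutated inside the loop.
    for k in list(parameters.keys()):
        # Python 2 tested `type(v) == unicode`; in Python 3 text is `str`.
        if isinstance(parameters[k], str):
            parameters[k] = parameters[k].encode("utf8")
    return parameters

try:
    print(encode_parameters({"name": "café"}))   # print is a function now
except Exception as e:                           # Python 2: `except Exception, e:`
    print("failed:", e)
```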
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>web-crawler</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.python.pydev.pythonNature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
encoding/testAPI.py=utf-8
......@@ -120,7 +120,7 @@ class CrawlerDatabase:
else:
parameters = Utf8Helper.encode_parameters(parameters)
cur.execute(sqlString, parameters)
except Exception, e:
except Exception as e:
logger.error("SQL = " + sqlString + " PARAMETERS = " + str(parameters))
logger.error(str(e))
if commit:
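For context, a minimal stand-alone sketch (assumed names, not the project's class) of the pattern this hunk touches: a parameterized execute wrapper that logs the failing statement, written against a generic DB-API 2.0 driver (sqlite3 here; the real CrawlerDatabase uses MySQL and %(name)s placeholders).

```python
import logging
import sqlite3  # stand-in for the MySQL driver used by CrawlerDatabase

logger = logging.getLogger("crawler")

def execute(conn, sql_string, parameters=None, commit=False):
    cur = conn.cursor()
    try:
        cur.execute(sql_string, parameters or ())
    except Exception as e:                # Python 3 exception syntax
        logger.error("SQL = %s PARAMETERS = %s", sql_string, parameters)
        logger.error(str(e))
    if commit:
        conn.commit()

conn = sqlite3.connect(":memory:")
execute(conn, "CREATE TABLE t (k TEXT PRIMARY KEY, v TEXT)", commit=True)
execute(conn, "INSERT INTO t VALUES (?, ?)", ("a", "b"), commit=True)
```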
......@@ -135,8 +135,8 @@ class Utf8Helper():
this method is used to encode parameters
Some chars >= 128 end up in the string, so I need to remove them to prevent exceptions
'''
for k in parameters.keys():
if type(parameters[k]) == unicode or type(parameters[k]) == etree._ElementUnicodeResult:
for k in list(parameters.keys()):
if type(parameters[k]) == str or type(parameters[k]) == etree._ElementUnicodeResult:
safe_string = StringHelper.removeNonAscii(parameters[k]).encode('utf8')
#logger.info("Utf8Helper.encode_parameters has encoded: " + safe_string)
parameters[k] = safe_string
......
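A hedged, self-contained restatement of what Utf8Helper.encode_parameters amounts to after the port; remove_non_ascii below stands in for the project's StringHelper.removeNonAscii, and since lxml's _ElementUnicodeResult subclasses str, the isinstance check covers that case too.

```python
def remove_non_ascii(s):
    # keep only 7-bit ASCII characters, mirroring StringHelper.removeNonAscii
    return "".join(ch for ch in s if ord(ch) < 128)

def encode_parameters(parameters):
    for k in list(parameters.keys()):      # copy keys before mutating the dict
        if isinstance(parameters[k], str):
            parameters[k] = remove_non_ascii(parameters[k]).encode("utf8")
    return parameters

print(encode_parameters({"name": "café", "stars": 42}))
# -> {'name': b'caf', 'stars': 42}
```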
......@@ -4,22 +4,25 @@
#
# Copyright 2014 Bitergium SLL
#February 2021
#Owner Engineering Ingegneria Informatica S.p.A.
#Update the code for the MORPHEMIC release 1.5
#Updated on February 2021 for MORPHEMIC project by Maria Antonietta Di Girolamo
# Ownership : Engineering Ingegneria Informatica S.p.A.
# Author: Maria Antonietta Di Girolamo
# Year: 2020/2021
#Maintainer:Maria Antonietta Di Girolamo
from dw_batch import DwBatch
from doap_project import DoapProject, DoapRepository, DoapVersion, FoafPerson
#from doap_project import DoapProject, DoapRepository, DoapVersion, FoafPerson
from Utils import Logger, TrivialJSONEncoder, Configuration, DateHelper, Sources, StringList
from github_data_fetcher import GithubArchiveStats
from doapfiend.doaplib import load_graph
from doapfiend.model import Project as doap
#from doapfiend.doaplib import load_graph
#from doapfiend.model import Project as doap
from CrawlerDatabase import CrawlerDatabase
import base64
import site
import sys
import os, stat
import urllib3
import json2html
#import json2html
import requests
import decimal
import json
......
......@@ -4,20 +4,24 @@
#
# Copyright 2014 Bitergium SLL
#February 2021
#Owner Engineering Ingegneria Informatica S.p.A.
#Update the code for the MORPHEMIC release 1.5
#Updated on February 2021 for MORPHEMIC project release 1.0
# Ownership : Engineering Ingegneria Informatica S.p.A.
# Author: Maria Antonietta Di Girolamo
# Year: 2020/2021
#Maintainer:Maria Antonietta Di Girolamo
#Last updated on July2021
import time
from Integrator import ApacheIntegrator, GithubIntegrator, JQueryPluginIntegrator
from apache_data_fetcher import ApacheDataFetcher
from jquery_plugin_data_fetcher import JQueryPluginDataFetcher
from Utils import Configuration, Logger, Sources
from CrawlerDatabase import CrawlerDatabase
from RepositoryCrawler import RepositoryCrawler
#from RepositoryCrawler import RepositoryCrawler
from generic_data_fetcher import GenericDataFetcher
from github_data_fetcher import GitHubDataFetcher
#from r_forge_data_fetcher import R_Forge_DataFetcher
import traceback
import time
class Orchestrator():
......@@ -38,63 +42,50 @@ class Orchestrator():
logger.info("Waking up after " + str(Configuration.sleep_time) + " seconds.")
Configuration()
if Configuration.exit_now:
logger.info("Exiting Orchestrator as Configuration.exit_now is True.")
break
logger.info("Exiting Orchestrator as Configuration.exit_now is True.")
break
def iteration(self):
logger = Logger.getInstance()
logger.info("Starting retrieving of the metadata project") ## DEBUG!!
#logger.info("Starting fetch the projects open source") ## DEBUG!!
start = time.time()
#gh = GitHubDataFetcher()
#while True:
#rc = RepositoryCrawler()
#rc = RepositoryCrawler()
try:
#gh.run()
#rc.run()
#end = time.time()
#print(('Time taken to load github fetcher into crawler db is : ', str(end - start)))
if Configuration.github_every_n_days > 0:
gh = GitHubDataFetcher()
gh.run()
gh.batch.complete()
end = time.time()
print('Time taken to load github fetcher into crawler db is : ', str(end - start))
#logger.info("Fetching data from GitHubArchive through RepositoryCrawler")
'''
start = time.time()
ghi = GithubIntegrator()
if hasattr(ghi, 'batch') and not (ghi.batch is None):
ghi.integrate()
ghi.limitBatchLength()
end = time.time()
print('Time taken to load into MetadataProject table is : ', str(end - start))
'''
# APACHE
if Configuration.apache_every_n_days > 0:
logger.info("dentro githubveryndays")
gh = GitHubDataFetcher()
#logger.info("prima di run githubdatafetcher")
gh.run()
logger.info("dopo github.run")
gh.batch.complete()
end = time.time()
print('Time taken to load github fetcher into crawler db is : ', str(end - start))
if Configuration.apache_every_n_days > 0:
logger.info("Starting ApacheDataFetcher")
adf = ApacheDataFetcher()
#logger.info("ci arrivi qu iprima del run?")
adf.run()
adf.batch.complete()
ai = ApacheIntegrator()
if hasattr(ai, 'batch') and not (ai.batch is None):
ai.integrate()
ai.limitBatchLength()
#JQueryPlugin
if Configuration.jqueryplugin_every_n_days > 0:
#adf.batch.complete()
end = time.time()
print('Time taken to load apache fetcher into crawler db is : ', str(end - start))
if Configuration.jqueryplugin_every_n_days > 0:
logger.info("Starting JQueryPluginDataFetcher")
jq = JQueryPluginDataFetcher(Sources.JQueryPlugin)
jq = JQueryPluginDataFetcher()
jq.run()
jq.batch.complete()
jqi = JQueryPluginIntegrator()
if hasattr(jqi, 'batch') and not (jqi.batch is None):
jqi.integrate()
jqi.limitBatchLength()
except Exception, ex:
except Exception as ex:
traceback.print_exc()
#logging.error(str(ex))
logger.info(str(ex))
Orchestrator()
#logger.info(str(ex))
Orchestrator()
\ No newline at end of file
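A hedged sketch of the outer loop this class implements (run an iteration, sleep, re-read the configuration, stop on exit_now); the Configuration class below is a stand-in with illustrative values, not the project's.

```python
import time

class Configuration:          # stand-in, not the project's Configuration
    sleep_time = 5            # illustrative value
    exit_now = False

def run(iteration, logger=print):
    while True:
        iteration()
        time.sleep(Configuration.sleep_time)
        logger("Waking up after " + str(Configuration.sleep_time) + " seconds.")
        # the real code re-instantiates Configuration() here to pick up changes
        if Configuration.exit_now:
            logger("Exiting Orchestrator as Configuration.exit_now is True.")
            break

# Usage sketch: run(lambda: None) loops until Configuration.exit_now is True.
```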
......@@ -5,7 +5,12 @@
# Copyright 2014 Bitergium SLL
The MARKOS Crawler was developed for the MARKOS EU Project (2003-2005).
The MARKOS Crawler has been updated for the MORPHEMIC project by the ENG team.
Create it under the home directory: $HOME/markos/markos02/github
Example: /home/ubuntu/markos/markos02/github
In July 2020 the FLOSSmole forge was discontinued, so the ENG team
decided to remove FLOSSmole from the MARKOS Crawler for the MORPHEMIC project.
\ No newline at end of file
decided to remove FLOSSmole from the Web-Crawler for the MORPHEMIC project.
In March 2021 the creation of the DOAP model was separated from the Web-Crawler and moved into a new
service of the MORPHEMIC project (called KnowledgeBase).
In May 2021 the REST API developed by Engineering for managing the communication with the other services of the
MARKOS project was dismissed in favour of the ElasticSearch client.
A new RESTful API is used by the KnowledgeBase to communicate with the Web-Crawler.
In July 2021 the Web-Crawler code was updated to a new, stable version of Python: 3.9.6.
......@@ -16,18 +16,9 @@ import traceback
import time
from generic_data_fetcher import GenericDataFetcher
from __builtin__ import False
from webob import Response, exc,request
from cornice import Service, validators
import uuid
import sys
from pyramid.renderers import render
from webob.request import Request
sys.path.append('..')
crawler = Service(name='sendmessage', path='/crawler', description="send message")
from github_data_fetcher import GitHubDataFetcher
from apache_data_fetcher import ApacheDataFetcher
from jquery_plugin_data_fetcher import JQueryPluginDataFetcher
class RepositoryCrawler(GenericDataFetcher):
'''
......@@ -37,232 +28,46 @@ class RepositoryCrawler(GenericDataFetcher):
super(RepositoryCrawler, self).__init__(Sources.Github)
def run(self):
logger = Logger.getInstance()
'''
'''
try:
start = time.time()
if Configuration.github_every_n_days > 0:
logger.info("dentro githubveryndays")
gh = GitHubDataFetcher()
#logger.info("prima di run githubdatafetcher")
gh.run()
logger.info("dopo github.run")
gh.batch.complete()
end = time.time()
print(('Time taken to load github fetcher into crawler db is : ', str(end - start)))
if Configuration.apache_every_n_days > 0:
logger.info("Starting ApacheDataFetcher")
adf = ApacheDataFetcher()
adf.run()
#adf.batch.complete()
end = time.time()
print(('Time taken to load apache fetcher into crawler db is : ', str(end - start)))
#JQueryPlugin
if Configuration.jqueryplugin_every_n_days > 0:
logger.info("Starting JQueryPluginDataFetcher")
jq = JQueryPluginDataFetcher()
jq.run()
jq.batch.complete()
print(('Time taken to load apache fetcher into crawler db is : ', str(end - start)))
sleep_time = Configuration.repository_crawler_sleep_time
logger.info("Repository Crawler about to sleep for " + str(sleep_time) + " seconds.")
time.sleep(sleep_time)
# to avoid 2006 'MySQL server has gone away' issue
CrawlerDatabase.connect()
logger.info("Repository Crawler waking up after " + str(sleep_time) + " seconds.")
months_of_stats = 0
dt = date.today()
while months_of_stats < Configuration.github_archive_months:
dt1 = dt.replace(day=1) #go to first day of month
dt = dt1 - timedelta(days=1) #back one day so I get previous month
year_minus_cursor = dt.year
month_minus_cursor = dt.month
gas = GithubArchiveStats(year_minus_cursor, month_minus_cursor)
#do I have data for this month
parameters = {
'stat_year': year_minus_cursor,
'stat_month': month_minus_cursor
}
if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0:
months_of_stats = months_of_stats + 1
elif gas.filesAvailable():
gas.fetchFiles()
gas.processFiles()
months_of_stats = months_of_stats + 1
#processing of the downloaded files is finished
#the Crawler sends a message to the KnowledgeBase: ready
#self.send_message()
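The backwards month walk above (go to the first of the month, then step back one day) is the core of both run() and statsAvailable(); a hedged, stand-alone version with a worked example follows (the function name is illustrative).

```python
from datetime import date, timedelta

def previous_months(n, today=None):
    """Yield (year, month) for the n months preceding the current one."""
    dt = today or date.today()
    for _ in range(n):
        dt = dt.replace(day=1) - timedelta(days=1)   # last day of previous month
        yield dt.year, dt.month

print(list(previous_months(3, date(2021, 7, 15))))
# [(2021, 6), (2021, 5), (2021, 4)]
```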
class GithubArchiveStats():
"""
We fetch statistics from http://www.githubarchive.org/
an instance takes care of a specific month in a year
"""
def __init__(self, y, m):
self.y = y
self.m = m
self.mm = "%02d" % m
#self.idDWBatch = self.batch.id_batch
dt = date.today()
dt1 = dt.replace(day=1) #go to first day of month
dt = dt1 - timedelta(days=31) #add 31 days so I go to next month
self.mm_next_month = "%02d" % dt.month
self.yyyy_next_month = "%02d" % dt.year
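Note that the comment above says "add 31 days so I go to next month" while the code subtracts 31 days from the first of the month; for reference, a hedged stand-alone computation of the following month (mirroring the yyyy_next_month / mm_next_month labels) looks like this.

```python
from datetime import date, timedelta

def next_month(dt):
    first = dt.replace(day=1)
    nxt = first + timedelta(days=31)   # from the 1st, +31 days always lands in the next month
    return "%04d" % nxt.year, "%02d" % nxt.month

print(next_month(date(2021, 7, 15)))   # ('2021', '08')
```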
@staticmethod
def statsAvailable():
"""
Returns true if there are N months of statistics in the local database out of the last N+1 months
where N = Configuration.github_archive_months
we look back N+1 months because yestermonth's statistics will not be ready during the first days
of the month; so it is ok to have the last N available even if yestermonth is not there
"""
logger = Logger.getInstance()
months_of_stats = 0
how_many = 0
date_cursor = date.today()
while months_of_stats <= Configuration.github_archive_months:
dt1 = date_cursor.replace(day=1) #go to first day of month
date_cursor = dt1 - timedelta(days=1) #back one day so I get previous month
year_minus_cursor = date_cursor.year
month_minus_cursor = date_cursor.month
#do I have data for this month
parameters = {
'stat_year': year_minus_cursor,
'stat_month': month_minus_cursor
}
if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s", parameters) > 0:
how_many = how_many + 1
months_of_stats = months_of_stats + 1
logger.debug("GithubArchiveStats.statsAvailable: Do we have any stats to process?" + str(how_many >= Configuration.github_archive_months)) ## DEBUG
return how_many >= Configuration.github_archive_months
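A hedged, self-contained restatement of the statsAvailable() rule: out of the last N+1 complete months, at least N must already be in the local database. have_stats below stands in for the COUNT(*) query against rc_gh_archive.

```python
from datetime import date, timedelta

def stats_available(have_stats, n, today=None):
    dt = today or date.today()
    how_many = 0
    for _ in range(n + 1):                         # look back N+1 months
        dt = dt.replace(day=1) - timedelta(days=1)
        if have_stats(dt.year, dt.month):
            how_many += 1
    return how_many >= n

# Example: stats for May and June 2021 are present, N = 2, today is July 2021.
print(stats_available(lambda y, m: (y, m) in {(2021, 6), (2021, 5)}, 2,
                      date(2021, 7, 10)))          # True
```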
def filesAvailable(self):
"""
Are files available at http://data.githubarchive.org
I assume all files for a month are available if first file of next month is available
"""
proc = Popen("wget http://data.githubarchive.org/" + self.yyyy_next_month + "-" + self.mm_next_month + "-01-0.json.gz", shell=True, cwd=Configuration.temporary_directory)
return_code = proc.wait()
if return_code == 8:
return False
return True
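wget's exit status 8 means the server issued an error response (for example 404). A hedged sketch of the same availability probe without shelling out, using an HTTP HEAD request against the URL pattern shown above:

```python
import urllib.request
import urllib.error

def files_available(yyyy_next, mm_next):
    url = ("http://data.githubarchive.org/"
           + yyyy_next + "-" + mm_next + "-01-0.json.gz")
    try:
        req = urllib.request.Request(url, method="HEAD")
        urllib.request.urlopen(req, timeout=30)
        return True
    except urllib.error.HTTPError:   # server error response, like wget code 8
        return False
```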
def fetchFiles(self):
"""
Files are per hour with name: YEAR-MONTH-DAY-HOUR.json.gz
"""
logger = Logger.getInstance()
#Pavia: for day_iter in range(1, monthrange(self.y, self.m)[1] + 1): #number of days in this month
for day_iter in range(1, 3): #number of days in this month
#Pavia: for hour_iter in range(24):
for hour_iter in range(10, 12):
sz_day = "%02d" % day_iter
sz_hour = str(hour_iter)
if not os.path.isfile(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz"):
proc = Popen("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz", shell=True, cwd=Configuration.github_file_path + "/gh")
return_code = proc.wait()
if return_code == 8:
logger.error("wget http://data.githubarchive.org/" + str(self.y) + "-" + self.mm + "-" + sz_day + "-" + sz_hour + ".json.gz" + " returned error code 8")
def processFiles(self):
logger = Logger.getInstance()
compressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json.gz")
for compressed_file in compressed_files:
proc = Popen("gunzip " + compressed_file, shell=True, cwd=Configuration.github_file_path + "/gh")
return_code = proc.wait()
uncompressed_files = glob.glob(Configuration.github_file_path + "/gh/" + str(self.y) + "-" + self.mm + "*.json")
for uncompressed_file in uncompressed_files:
with open(uncompressed_file) as f:
content = f.readlines()
for line in content:
try:
decoded = json.loads(line)
# GistEvent lines have no repository
if decoded["type"] != "GistEvent" : #not interested in Gists
#To speed up testing restrict to ReleaseEvent
#if decoded["type"] == "ReleaseEvent":
repo = decoded["repo"]
logger.debug("Parsing event type: " + decoded["type"] + " from project: " + repo["name"])
try:
if decoded["type"] == "RepositoryEvent" and ( decoded["action"] == "created" or decoded["action"] == "edited" or decoded["action"] == "renamed" ):
try:
project_description = decoded["description"]
logger.debug("Found description:" + project_description + " for project: " + repo["name"])
except:
project_description = ""
else:
project_description = ""
#print("DEBUG!! processfiles ")
#time.sleep(1.5)
'''
if decoded["type"] == "PullRequestEvent" :
payload = decoded["payload"]
pull_request = payload["pull_request"]
deployments_url = pull_request["deployments_url"]
license = pull_request["license"]
language = pull_request["language"]
logger.debug("deploy " + deployment_url + " license " + license + " language " + language)
'''
#Pavia: this block increments the event count for a project already known for the period covered by the statistics
#if we previously found a description for the project, the corresponding field is updated
parameters = {
'project_name': str(repo["name"]),
'description': str(project_description),
'stat_year': self.y,
'stat_month': self.m,
}
#print "parameters"
#print(str(parameters))
#print "str(idDWBatch)"
#print str(self.idDWBatch)
#print("SELECT count(*) from rc_gh_archive WHERE project_name = '" + str(repo["name"]) + "' AND idGhProject >0 ")
#if CrawlerDatabase.select_int("SELECT count(*) from rc_gh_archive WHERE project_name = '" + str(repo["name"]) + "' AND idGhProject >0 ") > 0 :
#Maria February 2021: we limit the download of projects from GitHub to ReleaseEvent only
if decoded["type"] == "ReleaseEvent":
if CrawlerDatabase.select_int("SELECT COUNT(*) FROM rc_gh_archive WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", parameters) > 0:
if parameters['description'] == "": #if description is empty I do not overwrite it as it might have been there in other events
CrawlerDatabase.execute("UPDATE rc_gh_archive SET event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", False, parameters)
else:
CrawlerDatabase.execute("UPDATE rc_gh_archive SET description=%(description)s, event_count=event_count+1 WHERE stat_year=%(stat_year)s AND stat_month=%(stat_month)s AND project_name=%(project_name)s", False, parameters)
else:
#Maria February 2021: the INSERT fails on the primary key project_name when
#the same project arrives with a different release/developer name
#We use INSERT differently, e.g.: INSERT INTO ins_duplicate VALUES (4,'Gorilla') ON DUPLICATE KEY UPDATE animal='Gorilla';
#CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s) ON DUPLICATE KEY UPDATE project_name=%(project_name)s", False, parameters)
CrawlerDatabase.execute("INSERT INTO rc_gh_archive (project_name, description, event_count, stat_year, stat_month) VALUES (%(project_name)s, %(description)s, 1, %(stat_year)s, %(stat_month)s)", True, parameters)
#Pavia: if the event is of type ReleaseEvent, here we parse the information needed to populate rc_gh_archive_release
#Maria's comment, February 2021: for the moment we only take projects from ReleaseEvent
#we limit the download of projects from GitHub
#if decoded["type"] == "ReleaseEvent":
#Pavia: the release "url" and "id" are now found under payload->release
payload = decoded["payload"]
release = payload["release"]
developer_name = ''
developer_name = repo["name"].rsplit("/",1)[0]
#print developer_name
parameters = {
'project_name': str(repo["name"]),
'developer_name': str(repo["name"].rsplit("/",1)[0]),
'url': str(release["url"]),
'version': str(release["id"])
}
#print("Found release event for project: " + repo["name"] + ", release id: " + str(release["id"]) + ", release url: " + release["url"])
#print ("SELECT count(*) FROM rc_gh_archive_release WHERE project_name = " + repo["name"] + " AND version = " + release["id"]+ " AND developer_name=" + developer_name)
#sqlstring = "SELECT count(*) FROM rc_gh_archive_release a , rc_gh_archive b WHERE a.project_name = '" + repo["name"] + "'" + "AND a.project_name = b.project_name "
#sqlstring = sqlstring + " AND a.version ='" + str(release['id']) + "' AND a.developer_name = '" + str(repo["name"].rsplit("/",1)[0]) + "'"
#print "======================================================="
#print sqlstring
#print "======================================================="
#if CrawlerDatabase.select_int("SELECT count(*) FROM rc_gh_archive_release WHERE project_name = '" + repo["name"] + "' AND version = '" + release["id"]+ "' AND developer_name='" + developer_name + "'") == 0:
# CrawlerDatabase.execute("UPDATE rc_gh_archive_release SET project_name=%(project_name)s AND version=%(version)s AND developer_name = %(developer_name)s AND url=%(url)s", False, parameters)
#else:
#try:
# CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version, developer_name) VALUES (%(project_name)s, %(url)s, %(version)s, %(developer_name)s) ", True, parameters)
#except:
# logger.error("Error INSERTING INTO rc_gh_archive_release: " + str(parameters))
#else:
#logger.info("WARNING: The github project " + str(repo["name"]) + " with release " + str(release["id"]) + " for the developer name " + str(repo["name"].rsplit("/",1)[0]) + " exist.")
logger.debug("Found release event for project: " + repo["name"] + ", release id: " + str(release["id"]) + ", release url: " + release["url"])
CrawlerDatabase.execute("INSERT INTO rc_gh_archive_release (project_name, url, version, developer_name) VALUES (%(project_name)s, %(url)s, %(version)s, %(developer_name)s) ON DUPLICATE KEY UPDATE url = %(url)s, version=%(version)s, developer_name=%(developer_name)s", True, parameters)
except Exception, ex:
logger.error(traceback.format_exc())
#logger.error(str(ex))
# do nothing; sometimes repository is missing
except Exception, ex:
logger.info(str(ex) + " missing in " + line)
except Exception as ex:
traceback.print_exc()
time.sleep(5)
#logger.error(str(ex))
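A hedged, stand-alone sketch of the per-line parsing done in processFiles(): each decompressed GitHub Archive file holds one JSON event per line, and in this version only ReleaseEvent entries contribute rows to rc_gh_archive / rc_gh_archive_release. The function name and sample data are illustrative.

```python
import json

def parse_release_events(lines):
    for line in lines:
        try:
            event = json.loads(line)
        except ValueError:
            continue                          # skip malformed lines
        if event.get("type") != "ReleaseEvent":
            continue
        repo = event["repo"]
        release = event["payload"]["release"]
        yield {
            "project_name": repo["name"],
            "developer_name": repo["name"].rsplit("/", 1)[0],
            "url": release["url"],
            "version": str(release["id"]),
        }

sample = ['{"type": "ReleaseEvent", "repo": {"name": "octo/demo"}, '
          '"payload": {"release": {"url": "http://x", "id": 7}}}']
print(list(parse_release_events(sample)))
```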
......@@ -18,12 +18,11 @@ logger = Logger.getInstance()
while True:
try:
if Configuration.github_every_n_days > 0:
logger.info("Fetching data from GitHubArchive through RepositoryCrawler")
#before integrating into the MetadataProject, the crawler should notify the Analyser
#that the MetadataProject is ready ...
gh.run()
gh.batch.complete()
#logger.info("Fetching data from GitHubArchive through RepositoryCrawler")
#before integrating into the MetadataProject, the crawler should notify the Analyser
#that the MetadataProject is ready ...
gh.run()
gh.batch.complete()
sleep_time = Configuration.repository_crawler_sleep_time
logger.info("Repository Crawler about to sleep for " + str(sleep_time) + " seconds.")
......
......@@ -4,17 +4,26 @@
#
# Copyright 2014 Bitergium SLL
#Updated on April 2020 for MORPHEMIC project
#Last updated on July 2021
# Ownership : Engineering Ingegneria Informatica S.p.A.
# Author: Maria Antonietta Di Girolamo
# Year: 2020/2021
#Maintainer:Maria Antonietta Di Girolamo
import logging
import logging.handlers
import base64
import datetime
import ConfigParser
#import ConfigParser
import configparser
import json
import os
import sys
from json import JSONEncoder
from datetime import date
import urllib2
#import urllib2
import urllib.request, urllib.parse, urllib.error
from urllib import request
class Logger():
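A hedged sketch of how the renamed stdlib modules above are used after the port; the section, option, and file names are illustrative, not the project's configuration keys.

```python
import configparser          # Python 2: ConfigParser
import urllib.request        # Python 2: urllib2

config = configparser.ConfigParser()
config.read("crawler.ini")   # hypothetical file name
sleep_time = config.getint("crawler", "sleep_time", fallback=3600)

def fetch(url):
    # urllib2.urlopen(url) becomes urllib.request.urlopen(url)
    with urllib.request.urlopen(url) as resp:
        return resp.read()
```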
......@@ -91,35 +100,35 @@ class Logger():
def debug(self,msg):
if self.__stdout and self.__logLevel<=0:
print ('DEBUG: '+ msg)
print(('DEBUG: '+ msg))
#BUG logger does not remember its level; it remembers its file path though
#logging.getLogger( ).setLevel(Configuration.logging_level)
logging.getLogger().debug(msg)
def error(self,msg):
if self.__stdout and self.__logLevel<=3:
print ('ERROR: '+ msg)
print(('ERROR: '+ msg))
#BUG logger does not remember its level; it remembers its file path though
#logging.getLogger( ).setLevel(Configuration.logging_level)
logging.error(msg)
def info(self,msg):
if self.__stdout and self.__logLevel<=1:
print ('INFO: '+msg)
print(('INFO: '+msg))
#BUG logger does not remember its level; it remembers its file path though
#logging.getLogger( ).setLevel(Configuration.logging_level)
logging.getLogger( ).info(msg)
def critical(self,msg):
if self.__stdout and self.__logLevel<=5:
print ('CRITICAL: '+msg)
print(('CRITICAL: '+msg))
#BUG logger does not remember its level; it remembers its file path though
#logging.getLogger( ).setLevel(Configuration.logging_level)
logging.critical(msg)
def fatal(self,msg):
if self.__stdout and self.__logLevel<=4:
print ('FATAL: '+msg)
print(('FATAL: '+msg))
#BUG logger does not remember its level; it remembers its file path though
#logging.getLogger( ).setLevel(Configuration.logging_level)
logging.fatal(msg)
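A hedged sketch (a new stand-in class, not the project's Logger) of the level gate implemented by the methods above: echo to stdout only when the configured threshold allows it, then delegate to the stdlib logger. The numeric thresholds mirror the comparisons in the code.

```python
import logging

class ConsoleEcho:
    def __init__(self, stdout=True, log_level=1):
        self._stdout = stdout
        self._log_level = log_level

    def _emit(self, label, threshold, msg, log_fn):
        if self._stdout and self._log_level <= threshold:
            print(label + ': ' + msg)   # single print() call, no 2to3 double parentheses
        log_fn(msg)

    def debug(self, msg):    self._emit('DEBUG', 0, msg, logging.getLogger().debug)
    def info(self, msg):     self._emit('INFO', 1, msg, logging.getLogger().info)
    def error(self, msg):    self._emit('ERROR', 3, msg, logging.error)
    def fatal(self, msg):    self._emit('FATAL', 4, msg, logging.fatal)
    def critical(self, msg): self._emit('CRITICAL', 5, msg, logging.critical)
```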
......@@ -127,7 +136,7 @@ class Logger():
def warn(