hsh_triplify/triplify.py

371 lines
11 KiB
Python

# coding=utf-8
#SQL
from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import relationship, backref
# Declarative base class that every ORM model in this module subclasses.
Base = declarative_base()
class HsHOrganizationalUnitType(Base):
    """ORM mapping for the ``organizational_unit_type`` table."""
    __tablename__ = 'organizational_unit_type'
    id = Column(Integer, primary_key=True)
    name = Column(String)
class HsHOrganizationalUnit(Base):
    """ORM mapping for the ``organizational_unit`` table.

    Units reference their parent unit through the ``parent`` column, which
    points back at this same table.
    """
    __tablename__ = 'organizational_unit'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    # Foreign key to the parent unit's id (same table).
    parent = Column(Integer, ForeignKey('organizational_unit.id'))
    acronym = Column(String)
    # Plain integer column; no foreign key is declared for it here.
    post_address = Column(Integer)
    organizational_unit_type = Column(Integer, ForeignKey('organizational_unit_type.id'))
    # Self-referential relationship; remote_side=[id] marks ``id`` as the
    # remote (parent) side, so this attribute yields the parent unit.
    oParent = relationship("HsHOrganizationalUnit", remote_side=[id])
class HsHMembership(Base):
    """ORM mapping for the ``membership`` table.

    A membership row references a person, an organizational unit and a
    business role.
    """
    __tablename__ = 'membership'
    id = Column(Integer, primary_key=True)
    person = Column(Integer, ForeignKey('person.id'))
    organizational_unit = Column(Integer, ForeignKey('organizational_unit.id'))
    start_date = Column(DateTime)
    end_date = Column(DateTime)
    # NOTE(review): declared as String, but processPersons compares this
    # value with ``False`` -- confirm the real column type.
    active = Column(String)
    business_role = Column(Integer, ForeignKey('business_role.id'))
    # Related objects resolved through the foreign keys above.
    mPerson = relationship("HsHPerson")
    mBusinessRole = relationship("HsHBusinessRole")
    mOrg = relationship("HsHOrganizationalUnit")
class HsHBusinessRole(Base):
    """ORM mapping for the ``business_role`` table."""
    __tablename__ = 'business_role'
    id = Column(Integer, primary_key=True)
    # Compared against 'ProfessorIn' and 'WiMi' elsewhere in this module.
    name = Column(String)
    description = Column(String)
class HsHPerson(Base):
    """ORM mapping for the ``person`` table."""
    __tablename__ = 'person'
    id = Column(Integer, primary_key=True)
    # Family name; addPersonToGraph writes it to vcard:familyName.
    name = Column(String)
    firstname = Column(String)
    title_prefix = Column(String)
    title_suffix = Column(String)
    # getBusinessRoleName has label tables for "M" and "F"; None is mapped to "?".
    gender = Column(String)
    #birthday = Column(DateTime) #TODO: without timezone!
    # All membership rows that reference this person.
    memberships = relationship("HsHMembership")
class HsHTelephone(Base):
    """ORM mapping for the ``telephone`` table.

    Not referenced by the triple generation code visible in this module.
    """
    __tablename__ = 'telephone'
    id = Column(Integer, primary_key=True)
    call_number = Column(String)
    description = Column(String)
#RDF
from rdflib import Namespace
from rdfalchemy import rdfSingle, rdfMultiple
from rdfalchemy.rdfSubject import rdfSubject
from rdflib import Literal, BNode, Namespace, URIRef
from rdflib import RDF, RDFS, Graph, OWL
from rdflib.namespace import XSD
# Namespaces of the external vocabularies used for the generated triples.
foaf = Namespace('http://xmlns.com/foaf/0.1/')
# The Vitro vocabulary is a hash namespace; without the trailing '#' any term
# built from it would be a malformed IRI.
vitro = Namespace('http://vitro.mannlib.cornell.edu/ns/vitro/0.7#')
vivo = Namespace('http://vivoweb.org/ontology/core#')
vcard = Namespace('http://www.w3.org/2006/vcard/ns#')
obo = Namespace('http://purl.obolibrary.org/obo/')
hsh = Namespace('http://data.hs-hannover.de/ontology/hshOntologie#')
# Prefixes for locally minted individuals. The numeric id is appended directly
# after the trailing letter (p = person, o = organization, m = membership).
localPerson = Namespace('http://data.hs-hannover.de/individual/p')
localOrg = Namespace('http://data.hs-hannover.de/individual/o')
localMembership = Namespace('http://data.hs-hannover.de/individual/m')
def get_graph():
    """Return the graph behind ``rdfSubject.db`` with all prefixes bound."""
    graph = rdfSubject.db
    # Prefix/namespace pairs, registered in this exact order.
    prefix_table = (
        ('foaf', foaf),
        ('vitro', vitro),
        ('vivo', vivo),
        ('vcard', vcard),
        ('obo', obo),
        ('localperson', localPerson),
        ('localorg', localOrg),
        ('localMembership', localMembership),
        ('hsh', hsh),
    )
    for prefix, namespace in prefix_table:
        graph.bind(prefix, namespace)
    return graph
class Thing(rdfSubject):
    """Root of the RDF class hierarchy in this module, typed ``owl:Thing``."""
    rdf_type = OWL.Thing
    # Accessor for the subject's rdfs:label.
    label = rdfSingle(RDFS.label)
class hshThing(Thing):
    """Thing typed with the local ontology class ``hsh:hshLocal``."""
    rdf_type = hsh.hshLocal
class Person(hshThing):
    """RDF mapping for a person, typed ``foaf:Person``."""
    rdf_type = foaf.Person
    firstname = rdfSingle(vcard.givenName)
    name = rdfSingle(vcard.familyName)
    # addPersonToGraph stores the person's ContactInfo under this property.
    hasContactInfo = rdfSingle(obo.ARG_2000028)
    # Re-declares the rdfs:label accessor already defined on Thing.
    label = rdfSingle(RDFS.label)
    # processPersons assigns an organization URI to this property.
    associatedOe = rdfSingle(vivo.relatedBy)
class FacultyMember(Person):
    """Person typed ``vivo:FacultyMember``; used for 'ProfessorIn' roles."""
    rdf_type = vivo.FacultyMember
class NonFacultyAcademic(Person):
    """Person typed ``vivo:NonFacultyAcademic``; used for 'WiMi' roles."""
    rdf_type = vivo.NonFacultyAcademic
class NonAcademic(Person):
    """Person typed ``vivo:NonAcademic``; the fallback type in processPersons."""
    rdf_type = vivo.NonAcademic
class ContactInfo(Thing):
    """Contact information resource, typed ``vcard:Individual``."""
    rdf_type = vcard.Individual
    # Link to the Title resource carrying the person's academic title.
    hasTitle = rdfSingle(vcard.hasTitle,range_type=vcard.Title)
    # Back-reference to the person this contact information belongs to.
    contactInformationFor = rdfSingle(obo.ARG_2000029,range_type=foaf.Person)
class Title(Thing):
    """Title resource, typed ``vcard:Title``."""
    rdf_type = vcard.Title
    # The XSD datatype is spelled ``string`` (lower case); ``XSD.String``
    # names a term that does not exist in the XML Schema namespace.
    title = rdfSingle(vcard.title, range_type=XSD.string)
class Organization(hshThing):
    """RDF mapping for an organizational unit, typed ``foaf:Organization``."""
    rdf_type = foaf.Organization
    # Re-declares the rdfs:label accessor already defined on Thing.
    label = rdfSingle(RDFS.label)
    # processOrganizations stores the parent unit's URI under this property.
    parentOe = rdfSingle(obo.BFO_0000050)
    acronym = rdfSingle(vivo.abbreviation)
class OboMembership(Thing):
    """Thing typed ``obo:BFO_0000020``; base class of Membership."""
    rdf_type = obo.BFO_0000020
class Membership(OboMembership):
    """Position resource, typed ``vivo:Position``."""
    rdf_type = vivo.Position
    # Re-declares the rdfs:label accessor already defined on Thing.
    label = rdfSingle(RDFS.label)
    # processMembership assigns [person URI, organization URI] here.
    relates = rdfMultiple(vivo.relates)
    # Only assigned from commented-out code in processMembership.
    dateTimeInterval = rdfSingle(vivo.dateTimeInterval)
class FacultyPosition(Membership):
    """Membership typed ``vivo:FacultyPosition``; used for 'ProfessorIn' roles."""
    rdf_type = vivo.FacultyPosition
class NonFacultyPosition(Membership):
    """Membership typed ``vivo:NonFacultyPosition``; used for 'WiMi' roles."""
    rdf_type = vivo.NonFacultyPosition
class NonAcademicPosition(Membership):
    """Membership typed ``vivo:NonAcademicPosition``; the fallback position type."""
    rdf_type = vivo.NonAcademicPosition
class OboDateTimeInterval(Thing):
    """Thing typed ``obo:BFO_0000038``; base class of DateTimeInterval."""
    rdf_type = obo.BFO_0000038
class DateTimeInterval(OboDateTimeInterval):
    """Interval resource, typed ``vivo:DateTimeInterval``.

    Only referenced from commented-out code in processMembership.
    """
    rdf_type = vivo.DateTimeInterval
    start = rdfSingle(vivo.start)
    end = rdfSingle(vivo.end)
class OboDateTimeValue(Thing):
    """Thing typed ``obo:BFO_0000148``; base class of DateTimeValue."""
    rdf_type = obo.BFO_0000148
class DateTimeValue(OboDateTimeValue):
    """Point-in-time resource, typed ``vivo:DateTimeValue``.

    Only referenced from commented-out code in processMembership.
    """
    rdf_type = vivo.DateTimeValue
    dateTime = rdfSingle(vivo.dateTime)
    dateTimePrecision = rdfSingle(vivo.dateTimePrecision)
class IdSequence:
    """Counter that hands out consecutive integers, beginning at ``start``."""

    def __init__(self, start):
        # ``num`` holds the last value handed out; it starts one below
        # ``start`` so the first getNext() call yields ``start`` itself.
        self.num = start - 1

    def getNext(self):
        """Advance the counter by one and return the new value."""
        upcoming = self.num + 1
        self.num = upcoming
        return upcoming
def getBusinessRoleName(membership):
    """Return the gender-specific German label for a membership's role.

    Args:
        membership: object exposing ``mPerson.gender`` and
            ``mBusinessRole.id`` (an HsHMembership row in this module).

    Returns:
        The label from the "M" or "F" table when the person's gender is one
        of those codes. Any other value, including ``None``, uses the
        gender-neutral "?" table.

    Raises:
        KeyError: if the business role id is not one of the known ids.
    """
    names = {
        "M": {
            1000000000: "Beschäftigter",
            1000000001: "Professor",
            1000000002: "Wissenschaftlicher Mitarbeiter",
            1000000003: "Auszubildender",
            1000000004: "Lehrkraft für besondere Aufgaben",
            1000000005: "Lehrbeauftragter",
            1000000006: "Praktikant",
            1000000007: "Wissenschaftliche Hilfskraft",
            1000000008: "Studentische Hilfskraft",
            1000000009: "Lehrkraft",
            1000000010: "Leiter"
        },
        "F": {
            1000000000: "Beschäftigte",
            1000000001: "Professorin",
            1000000002: "Wissenschaftliche Mitarbeiterin",
            1000000003: "Auszubildende",
            1000000004: "Lehrkraft für besondere Aufgaben",
            1000000005: "Lehrbeauftragte",
            1000000006: "Praktikantin",
            1000000007: "Wissenschaftliche Hilfskraft",
            1000000008: "Studentische Hilfskraft",
            1000000009: "Lehrkraft",
            1000000010: "Leiterin"
        },
        "?": {
            1000000000: "Beschäftigte(r)",
            1000000001: "ProfessorIn",
            1000000002: "Wissenschaftliche(r) MitarbeiterIn",
            1000000003: "Auszubildende(r)",
            1000000004: "Lehrkraft für besondere Aufgaben",
            1000000005: "Lehrbeauftragte(r)",
            1000000006: "PraktikantIn",
            1000000007: "Wissenschaftliche Hilfskraft",
            1000000008: "Studentische Hilfskraft",
            1000000009: "Lehrkraft",
            1000000010: "LeiterIn"
        }
    }
    gender = membership.mPerson.gender
    business_role = membership.mBusinessRole.id
    # Missing or unrecognised gender codes use the neutral labels instead of
    # failing with a KeyError on the outer lookup.
    if gender not in names:
        gender = "?"
    return names[gender][business_role]
def addPersonToGraph(sqlP, additionalIdSeq):
    """Add a given HsHPerson to the graph.

    Args:
        sqlP: person row; ``id``, ``name``, ``firstname``, ``title_prefix``
            and ``title_suffix`` are read.
        additionalIdSeq: IdSequence used to mint ids for the auxiliary Title
            and ContactInfo resources (two ids per person with a title).

    Returns:
        The URIRef of the person resource.
    """
    # Join whichever title parts are present. Skipping empty strings as well
    # as None avoids a dangling space when only one part carries text.
    combined_title = ' '.join(
        part for part in (sqlP.title_prefix, sqlP.title_suffix) if part
    )
    person_uri = URIRef("%s%s" % (localPerson, sqlP.id))
    # Two classes are instantiated on the same URI on purpose; presumably
    # each instantiation contributes its own rdf:type -- confirm against
    # rdfalchemy's rdfSubject.
    hshThing(person_uri)
    rdfP = Person(person_uri)
    rdfP.firstname = sqlP.firstname
    rdfP.name = sqlP.name
    rdfP.label = "%s, %s" % (sqlP.name, sqlP.firstname)
    # If there is a title to add, attach it through a ContactInfo resource.
    if combined_title:
        # The title id is drawn before the contact-info id.
        title_uri = URIRef("%s%s" % (localPerson, additionalIdSeq.getNext()))
        rdfTitle = Title(title_uri)
        rdfTitle.title = combined_title
        contact_info_uri = URIRef("%s%s" % (localPerson, additionalIdSeq.getNext()))
        rdfCi = ContactInfo(contact_info_uri)
        rdfCi.hasTitle = rdfTitle
        rdfCi.contactInformationFor = rdfP
        rdfP.hasContactInfo = rdfCi
    return person_uri
def processMembership(membership, additionalIdSeq):
    """Create the position resource linking a person to an organization.

    Args:
        membership: membership row; its id plus the related person,
            organizational unit and business role are read.
        additionalIdSeq: IdSequence for auxiliary ids. Only the disabled
            date/time interval code below would draw from it.
    """
    position_uri = URIRef("%s%s" % (localMembership, membership.id))
    member_uri = URIRef("%s%s" % (localPerson, membership.mPerson.id))
    unit_uri = URIRef("%s%s" % (localOrg, membership.mOrg.id))
    #dtiUri = URIRef("%s%s" % (localMembership, additionalIdSeq.getNext()))
    #dtsUri = URIRef("%s%s" % (localMembership, additionalIdSeq.getNext()))
    #dteUri = URIRef("%s%s" % (localMembership, additionalIdSeq.getNext()))

    # Base class first, then the class whose properties are filled in; both
    # are instantiated on the same URI.
    OboMembership(position_uri)
    position = Membership(position_uri)
    position.relates = [member_uri, unit_uri]
    position.label = getBusinessRoleName(membership)
    #position.dateTimeInterval = dtiUri

    # Finally instantiate the role-specific position class on the same URI.
    specific_positions = {
        "ProfessorIn": FacultyPosition,
        "WiMi": NonFacultyPosition,
    }
    position_class = specific_positions.get(
        membership.mBusinessRole.name, NonAcademicPosition
    )
    position_class(position_uri)

    # Disabled: start/end date triples for the membership.
    #rdfDts = OboDateTimeValue(dtsUri)
    #rdfDts = DateTimeValue(dtsUri)
    #rdfDts.dateTime = membership.start_date
    #rdfDts.dateTimePrecision = vivo.yearPrecision
    #rdfDte = OboDateTimeValue(dteUri)
    #rdfDte = DateTimeValue(dteUri)
    #rdfDte.dateTime = membership.end_date
    #rdfDte.dateTimePrecision = vivo.yearPrecision
    #rdfDti = OboDateTimeInterval(dtiUri)
    #rdfDti = DateTimeInterval(dtiUri)
    #rdfDti.start = dtsUri
    #rdfDti.end = dteUri
def processPersons(session, additionalIdSeq):
    """Fetch all persons and create triples for them and their memberships.

    Args:
        session: SQLAlchemy session used to query HsHPerson rows.
        additionalIdSeq: IdSequence handed on to the helpers that mint
            auxiliary resources.
    """
    # Business role names that give the person an academic type.
    academic_person_types = {
        'ProfessorIn': FacultyMember,
        'WiMi': NonFacultyAcademic,
    }
    for sqlP in session.query(HsHPerson):
        # Persons without any membership at all are left out of the graph.
        if not sqlP.memberships:
            continue
        personUri = addPersonToGraph(sqlP, additionalIdSeq)

        has_academic_role = False
        for membership in sqlP.memberships:
            # Inactive memberships are skipped. The equality test is kept
            # as-is because the stored type of ``active`` is unclear.
            if membership.active == False:
                continue
            processMembership(membership, additionalIdSeq)

            person_type = academic_person_types.get(membership.mBusinessRole.name)
            if person_type is None:
                continue
            has_academic_role = True
            unit_uri = URIRef("%s%s" % (localOrg, membership.organizational_unit))
            typed_person = person_type(personUri)
            typed_person.associatedOe = unit_uri

        # No academic role among the processed memberships: non-academic.
        if not has_academic_role:
            NonAcademic(personUri)
def processOrganizations(session, additionalIdSeq):
    """Generate triples for all organizational units.

    Args:
        session: SQLAlchemy session used to query HsHOrganizationalUnit rows.
        additionalIdSeq: accepted for a uniform signature; not used here.
    """
    for unit in session.query(HsHOrganizationalUnit):
        unit_uri = URIRef("%s%s" % (localOrg, unit.id))
        # Both classes are instantiated on the same URI; only the
        # Organization instance is used to set properties.
        hshThing(unit_uri)
        org = Organization(unit_uri)
        org.label = unit.name
        if unit.acronym is not None:
            org.acronym = unit.acronym
        parent_unit = unit.oParent
        if parent_unit is not None:
            org.parentOe = URIRef("%s%s" % (localOrg, parent_unit.id))
def createTriples(db_url="postgresql://hshinfo:hshinfotest@141.71.2.152/hshinfo"):
    """Read organizations and persons from the database and serialize them.

    Args:
        db_url: SQLAlchemy database URL. The default is the value that used
            to be hard-coded, so existing callers keep working.
            NOTE(review): the default embeds credentials in source; prefer
            passing the URL in from configuration.

    Returns:
        The graph serialized as N3, exactly as ``Graph.serialize`` returns it.
    """
    g = get_graph()
    engine = create_engine(db_url)
    session = sessionmaker(bind=engine)()
    try:
        additionalIdSeq = IdSequence(2000000000) #don't care sequence
        # Organizations are generated before persons.
        processOrganizations(session, additionalIdSeq)
        processPersons(session, additionalIdSeq)
        triples = g.serialize(format='n3')
    finally:
        # Release database and graph resources even when generation fails.
        session.close()
        engine.dispose()
        g.close()
    return triples
if __name__ == '__main__':
    # Script entry point: print the generated triples and, when enabled,
    # also store them in data.n3.
    writeToFile = True
    triples = createTriples()
    print(triples)
    if writeToFile:
        # Graph.serialize() returns bytes on older rdflib releases and str
        # on rdflib >= 6; normalise so the binary write works with either.
        if isinstance(triples, bytes):
            payload = triples
        else:
            payload = triples.encode('utf-8')
        # The context manager closes the file even if the write fails.
        with open('data.n3', 'wb') as f:
            f.write(payload)