hsh_triplify/triplify.py

245 lines
7.3 KiB
Python
Raw Normal View History

2014-02-14 15:01:34 +01:00
#SQL
from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import relationship, backref
2014-02-14 15:01:34 +01:00
# Declarative base class; every ORM model class in this module inherits from it.
Base = declarative_base()
class HsHOrganizationalUnitType(Base):
    """ORM mapping for the ``organizational_unit_type`` table."""

    __tablename__ = 'organizational_unit_type'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # String column.
    name = Column(String)
class HsHOrganizationalUnit(Base):
    """ORM mapping for the ``organizational_unit`` table.

    Rows reference other rows of the same table through ``parent``; the
    ``oParent`` relationship resolves that reference to a mapped object.
    """

    __tablename__ = 'organizational_unit'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    name = Column(String)
    # Self-referential foreign key to organizational_unit.id.
    parent = Column(Integer, ForeignKey('organizational_unit.id'))
    acronym = Column(String)
    # Plain integer column; no foreign key is declared for it here.
    post_address = Column(Integer)
    # Foreign key to organizational_unit_type.id.
    organizational_unit_type = Column(
        Integer, ForeignKey('organizational_unit_type.id'))

    # remote_side=[id] marks the primary key as the remote end of the
    # self-reference, so this attribute points at the parent unit.
    oParent = relationship("HsHOrganizationalUnit", remote_side=[id])
2014-02-14 15:01:34 +01:00
class HsHMembership(Base):
    """ORM mapping for the ``membership`` table.

    A row links a person to an organizational unit and a business role.
    """

    __tablename__ = 'membership'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Foreign key to person.id.
    person = Column(Integer, ForeignKey('person.id'))
    # Plain integer column; no foreign key is declared for it here.
    organizational_unit = Column(Integer)
    start_date = Column(DateTime)
    end_date = Column(DateTime)
    # String flag; the triple generation in this module compares it to 'Y'.
    active = Column(String)
    # Foreign key to business_role.id.
    business_role = Column(Integer, ForeignKey('business_role.id'))

    # Mapped objects behind the two foreign keys above.
    mPerson = relationship("HsHPerson")
    mBusinessRole = relationship("HsHBusinessRole")
class HsHBusinessRole(Base):
    """ORM mapping for the ``business_role`` table."""

    __tablename__ = 'business_role'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # String column; the triple generation in this module matches it
    # against role names such as 'ProfessorIn' and 'WiMi'.
    name = Column(String)
    description = Column(String)
2014-02-14 15:01:34 +01:00
class HsHPerson(Base):
    """ORM mapping for the ``person`` table."""

    __tablename__ = 'person'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    name = Column(String)
    firstname = Column(String)
    title_prefix = Column(String)
    title_suffix = Column(String)
    # Plain integer column; no foreign key is declared for it here.
    account = Column(Integer)
    gender = Column(String)
    # Not mapped yet:
    #birthday = Column(DateTime) #TODO: without timezone!

    # Collection of HsHMembership rows that reference this person.
    memberships = relationship("HsHMembership")
2014-02-14 15:01:34 +01:00
class HsHTelephone(Base):
    """ORM mapping for the ``telephone`` table."""

    __tablename__ = 'telephone'

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    call_number = Column(String)
    description = Column(String)
#RDF
from rdflib import Namespace
from rdfalchemy import rdfSingle
from rdfalchemy.rdfSubject import rdfSubject
from rdflib import Literal, BNode, Namespace, URIRef
from rdflib import RDF, RDFS, Graph, OWL
from rdflib.namespace import XSD
# Namespaces of the vocabularies used in the generated triples.
foaf = Namespace('http://xmlns.com/foaf/0.1/')
core = Namespace('http://vivoweb.org/ontology/core#')
# The Vitro namespace URI ends in '#'. Without the separator every term
# built from this namespace (e.g. vitro.mostSpecificType) would be glued
# directly onto "0.7" and point at a non-existent resource.
vitro = Namespace('http://vitro.mannlib.cornell.edu/ns/vitro/0.7#')
# Same URI as `core`; both names are kept because both are bound as prefixes.
vivo = Namespace('http://vivoweb.org/ontology/core#')
vcard = Namespace('http://www.w3.org/2006/vcard/ns#')
obo = Namespace('http://purl.obolibrary.org/obo/')
hsh = Namespace('http://vivo.bib.hs-hannover.de/ontology/hshOntologie#')
# Base URIs for locally minted individuals. They carry no trailing
# separator because the callers in this module build URIs as
# "%s/%s" % (namespace, id).
localPerson = Namespace('http://vivo.bib.hs-hannover.de/individual/person')
localOrg = Namespace('http://vivo.bib.hs-hannover.de/individual/org')
2014-02-14 15:01:34 +01:00
def get_graph():
    """Return the shared rdfSubject graph with all module prefixes bound."""
    graph = rdfSubject.db
    # Binding order is kept exactly as before: `core` and `vivo` share one
    # URI, so which prefix ends up in the output may depend on the order.
    prefix_bindings = (
        ('foaf', foaf),
        ('core', core),
        ('vitro', vitro),
        ('vivo', vivo),
        ('vcard', vcard),
        ('obo', obo),
        ('localperson', localPerson),
        ('localorg', localOrg),
        ('hsh', hsh),
    )
    for prefix, namespace in prefix_bindings:
        graph.bind(prefix, namespace)
    return graph
class Thing(rdfSubject):
    """Root of the RDF-mapped classes in this module, typed ``owl:Thing``."""

    rdf_type = OWL.Thing

    # Single-valued rdfs:label.
    label = rdfSingle(RDFS.label)
class hshThing(Thing):
    """Thing typed ``hsh:hshLocal``; base class of Person and Organization."""

    rdf_type = hsh.hshLocal
class Person(hshThing):
    """RDF mapping of a person, typed ``foaf:Person``."""

    rdf_type = foaf.Person

    # Given and family name are stored with vcard predicates.
    firstname = rdfSingle(vcard.givenName)
    name = rdfSingle(vcard.familyName)
    # Link to the person's ContactInfo node (obo:ARG_2000028).
    hasContactInfo = rdfSingle(obo.ARG_2000028)
    # Declared again here although Thing already maps rdfs:label.
    label = rdfSingle(RDFS.label)
    # Organization the person is associated with (vivo:relatedBy).
    associatedOe = rdfSingle(vivo.relatedBy)
class FacultyMember(Person):
    """Person typed ``vivo:FacultyMember``."""

    rdf_type = vivo.FacultyMember
class NonFacultyAcademic(Person):
    """Person typed ``vivo:NonFacultyAcademic``."""

    rdf_type = vivo.NonFacultyAcademic
class NonAcademic(Person):
    """Person typed ``vivo:NonAcademic``."""

    rdf_type = vivo.NonAcademic
2014-02-14 15:01:34 +01:00
class ContactInfo(Thing):
    """Contact information node, typed ``vcard:Individual``."""

    rdf_type = vcard.Individual

    # Link to a Title node (range vcard:Title).
    hasTitle = rdfSingle(vcard.hasTitle, range_type=vcard.Title)
    # Back-link to the person this contact information describes
    # (obo:ARG_2000029, range foaf:Person).
    contactInformationFor = rdfSingle(obo.ARG_2000029, range_type=foaf.Person)
2014-02-14 15:01:34 +01:00
class Title(Thing):
    """Title node, typed ``vcard:Title``, holding the title text."""

    rdf_type = vcard.Title

    # The XSD datatype for text is the lower-case term xsd:string;
    # "XSD.String" does not name any XML Schema datatype.
    title = rdfSingle(vcard.title, range_type=XSD.string)
class Organization(hshThing):
    """RDF mapping of an organizational unit, typed ``foaf:Organization``."""

    rdf_type = foaf.Organization

    # Declared again here although Thing already maps rdfs:label.
    label = rdfSingle(RDFS.label)
    # Link to the parent organization (obo:BFO_0000050).
    parentOe = rdfSingle(obo.BFO_0000050)
2014-02-14 15:01:34 +01:00
class IdSequence:
    """Counter that hands out consecutive integers.

    The first call to getNext() returns ``start``; every later call returns
    the previously returned value plus one.
    """

    def __init__(self, start):
        # Keep one less than `start` so getNext() can increment first.
        self.num = start - 1

    def getNext(self):
        """Advance the counter by one and return the new value."""
        following = self.num + 1
        self.num = following
        return following
def addPersonToGraph(sqlP, additionalIdSeq):
    """Add a given HsHPerson to the graph.

    Args:
        sqlP: person row; its ``id``, ``name``, ``firstname``,
            ``title_prefix`` and ``title_suffix`` attributes are read.
        additionalIdSeq: object whose ``getNext()`` supplies the ids for the
            extra Title and ContactInfo nodes.

    Returns:
        The URIRef of the person.
    """
    # Join only the title parts that are present. Filtering on truthiness
    # (instead of `!= None`) also drops empty strings, so a prefix combined
    # with an empty suffix no longer produces a trailing space.
    combined_title = ' '.join(
        part for part in (sqlP.title_prefix, sqlP.title_suffix) if part
    )

    person_uri = URIRef("%s/%s" % (localPerson, sqlP.id))
    # The same URI is deliberately instantiated through two mapped classes.
    # NOTE(review): presumably each constructor asserts its own rdf_type for
    # the URI -- confirm against rdfalchemy.
    hshThing(person_uri)
    rdfP = Person(person_uri)
    rdfP.firstname = sqlP.firstname
    rdfP.name = sqlP.name
    rdfP.label = "%s, %s" % (sqlP.name, sqlP.firstname)

    # If there is a title to add, attach it through a ContactInfo node.
    if combined_title:
        # Title and ContactInfo get ids from the extra sequence but live
        # under the same base URI as persons.
        title_uri = URIRef("%s/%s" % (localPerson, additionalIdSeq.getNext()))
        rdfTitle = Title(title_uri)
        rdfTitle.title = combined_title

        contact_info_uri = URIRef(
            "%s/%s" % (localPerson, additionalIdSeq.getNext()))
        rdfCi = ContactInfo(contact_info_uri)
        rdfCi.hasTitle = rdfTitle
        rdfCi.contactInformationFor = rdfP
        rdfP.hasContactInfo = rdfCi

    return person_uri
def processPersons(session, additionalIdSeq):
    """Fetch persons, create triples for them.

    Persons without any membership are skipped. Every other person is added
    to the graph and typed from their active memberships: 'ProfessorIn'
    gives FacultyMember, 'WiMi' gives NonFacultyAcademic, and a person with
    neither is typed NonAcademic.

    Args:
        session: SQLAlchemy session used to query HsHPerson rows.
        additionalIdSeq: id sequence passed on to addPersonToGraph().
    """
    # Business-role name -> RDF class used to type the person.
    academic_classes = {
        'ProfessorIn': FacultyMember,
        'WiMi': NonFacultyAcademic,
    }

    for sqlP in session.query(HsHPerson):
        if not sqlP.memberships:
            continue  # skip people without any memberships at all

        personUri = addPersonToGraph(sqlP, additionalIdSeq)

        # Stays True unless an active academic membership is found.
        isNonAcademic = True
        for membership in sqlP.memberships:
            if membership.active != 'Y':
                continue  # skip inactive memberships

            # A membership whose business role does not resolve to a row
            # cannot determine a person type; skip it instead of failing
            # with an AttributeError on None.
            role = membership.mBusinessRole
            rdf_class = None
            if role is not None:
                rdf_class = academic_classes.get(role.name)
            if rdf_class is None:
                continue

            isNonAcademic = False
            organization_uri = URIRef(
                "%s/%s" % (localOrg, membership.organizational_unit))
            rdfP = rdf_class(personUri)
            # NOTE(review): associatedOe is declared with rdfSingle, so with
            # several academic memberships presumably only the last
            # organization is kept -- confirm whether that is intended.
            rdfP.associatedOe = organization_uri

        # Assign the non-academic person type if necessary.
        if isNonAcademic:
            NonAcademic(personUri)
def processOrganizations(session, additionalIdSeq):
    """Generate triples for organizational units.

    Each HsHOrganizationalUnit row becomes an Organization labelled with the
    unit's name and, when the unit has a parent, linked to the parent's URI.
    ``additionalIdSeq`` is accepted but not used here.
    """
    for unit in session.query(HsHOrganizationalUnit):
        unit_uri = URIRef("%s/%s" % (localOrg, unit.id))
        # Instantiate the URI through both mapped classes, as for persons.
        hshThing(unit_uri)
        rdf_unit = Organization(unit_uri)
        rdf_unit.label = unit.name

        parent_unit = unit.oParent
        if parent_unit is None:
            continue  # top-level unit, nothing to link
        rdf_unit.parentOe = URIRef("%s/%s" % (localOrg, parent_unit.id))
def createTriples(db_url="postgresql://hshinfo:hshinfotest@141.71.2.152/hshinfo"):
    """Build the graph from the database and return it serialized as N3.

    Organizations are processed before persons; afterwards the graph is
    serialized and closed.

    Args:
        db_url: SQLAlchemy connection URL. The default is the URL that used
            to be hard-coded in the function body, so existing callers keep
            working.
            NOTE(review): the default embeds credentials in source code --
            callers should pass their own URL instead.

    Returns:
        The N3 serialization produced by the graph's ``serialize()``.
    """
    engine = create_engine(db_url)
    session = sessionmaker(bind=engine)()
    g = get_graph()
    additionalIdSeq = IdSequence(2000000000)  # don't care sequence
    try:
        processOrganizations(session, additionalIdSeq)
        processPersons(session, additionalIdSeq)
        # We're done; serialize before anything is closed.
        triples = g.serialize(format='n3')
    finally:
        # Release database and graph resources even if processing failed.
        session.close()
        engine.dispose()
        g.close()
    return triples
2014-02-14 15:01:34 +01:00
if __name__ == '__main__':
    # Script entry point: print the triples and optionally store them.
    writeToFile = True
    triples = createTriples()
    print(triples)
    if writeToFile:
        # The file is opened in binary mode. Depending on the rdflib version
        # serialize() yields bytes or text, so text is encoded first.
        payload = triples if isinstance(triples, bytes) else triples.encode('utf-8')
        # `with` closes the file even when the write fails.
        with open('data.n3', 'wb') as f:
            f.write(payload)