Source code for dspace

import hashlib
import urllib2
import xml.etree.ElementTree as ET
from pprint import pprint
import string
import unicodedata

[docs]class DSpace:
    """
    Talk to DSpace!
    """
    
    def __init__(self, public_key, private_key, rest_path):
        """
        Class for interacting with the ASU Digital HPS Community Repository
        custom API. https://github.com/mbl-cli/DspaceTools/wiki/API
        
        Parameters
        ----------
        public_key : string
        private_key : string
        rest_path : string
            URL for RESTful API endpoint
        """
        self.public_key = public_key
        self.private_key = private_key
        self.rest_path = rest_path

[docs]    def get_digest(self, path):
        """
        Produces an authentication digest based on resource path and your 
        private key.
        
        Parameters
        ----------
        path : string
            Relative URL of desired resource. E.g. '/items.xml'
            
        Returns
        -------
        string : authentication digest for desired resource.
        """
        
        m = hashlib.sha1('/rest' + path + private_key)
        return m.hexdigest()[0:8]
        
[docs]    def get_path(self, path, idOnly=False):
        """
        Produces a full path for the desired resource.
        
        Parameters
        ----------
        path : string
            Relative URL of desired resource. E.g. '/items.xml'
        idOnly : boolean
            If True, the returned path will yield only id/reference information
            for the desired resource. Default is False.
        
        Returns
        -------
        string : full URL of desired resource, including authentication 
            information.
        """
        
        digest = self.get_digest(resource, self.private_key)
        return self.rest_path + path + "?api_key=" + self.public_key + \
                "&api_digest=" + digest + "&idOnly=" + str(idOnly).lower()

[docs]    def get_element_from_resource(self, path, idOnly=False):
        """
        Retrieves the desired resource from the DSpace API, and returns an 
        ElementTree root node.
        
        Parameters
        ----------
        path : string
            Relative URL of desired resource. E.g. '/items.xml'
        idOnly : boolean
            If True, will yield only id/reference information for the desired 
            resource. Default is False.
        
        Returns
        -------
        ElementTree node : containing API response.
        """
        
        request_path = self.get_path(path, idOnly)
        response = urllib2.urlopen(request_path).read()
        return ET.fromstring(response)

[docs]    def clean_text(self, s):
        """
        Gets rid of garbage.
        
        Parameters
        ----------
        s : string
            A messy string.
        
        Returns
        -------
        string : A somewhat cleaner string.
        """
        
        norm = unicodedata.normalize('NFKD', unicode(s))
        return  norm.encode('ascii', 'ignore').rstrip().replace('\n','')

[docs]    def dict_from_node(self, node, recursive=False):
        """
        Converts ElementTree node to a dictionary. 
        
        Parameters
        ----------
        node : ElementTree node
        recursive : boolean
            If recursive=False, the value of any field with children will be the
            number of children.
        
        Returns
        -------
        dict : nested dictionary. 
            Tags as keys and values as values. Sub-elements that occur multiple 
            times in an element are contained in a list.
        """
        
        dict = {}
        for snode in node:
            if len(snode) > 0:
                if recursive:
                    # Will drill down until len(snode) <= 0.
                    value = self.dict_from_node(snode, True) 
                else:
                    value = len(snode)
            else:
                value = self.clean_text(snode.text)
                
            if snode.tag in dict.keys():    # If there are multiple subelements 
                                            #  with the same tag, then the value 
                                            #  of the element should be a list 
                                            #  rather than a dict.
                if type(dict[snode.tag]) is list:   # If a list has already been
                                                    #  started, just append to 
                                                    #  it.
                    dict[snode.tag].append(value)
                else:
                    dict[snode.tag] = [ dict[snode.tag], value ]
            else:
                dict[snode.tag] = value     # Default behavior.
        return dict

[docs]    def communities(self):
        """
        Retrieves all of the communities to which the user has access.
        
        Returns
        -------
        list : a list of nested dictionaries.
        """
        
        root = self.get_element_from_resource('/communities.xml')
        
        C = []
        for node in root:
            C.append(self.dict_from_node(node, True))
        return C

[docs]    def community(self, community):
        """
        Retrieves details about a specific community, by id.
        
        Parameters
        ----------
        community : string or int
            Community id.
        
        Returns
        -------
        dict : a nested dictionary.
        """
        
        path = '/communities/'+str(community)+'.xml'
        root = self.get_element_from_resource(path)
        return self.dict_from_node(root, True)

[docs]    def list_collections(self, community):
        """
        Retrieves details about the collections in a community.
        
        Parameters
        ----------
        community : string or int
            Community id.
            
        Returns
        -------
        list : a list of nested dictionaries.      
        """
        
        return self.community(community)['collections']['collectionentityid']

[docs]    def list_collection_ids(self, community):
        """
        Returns a list of collection IDs for a given community.
        
        Parameters
        ----------
        community : string or int
            Community id.
            
        Returns
        -------
        list : a list of collection ids.
        """
        
        return [ c['id'] for c in self.collections(community) ]
                   
[docs]    def collection(self, collection):
        """
        Retrieves details for a specific collection, by id.
        
        Parameters
        ----------
        collection : string or int
            Collection id.
            
        Returns
        -------
        dict: a nested dictionary.
        
        """
        
        path = '/collections/'+str(collection)+'.xml'
        root = self.get_element_from_resource(path)
        return self.dict_from_node(root, True)

[docs]    def list_items(self, collection):
        """
        Retrieves details about all items in a collection.
        
        Parameters
        ----------
        collection : string or int
            Collection id.
            
        Returns
        -------
        list : a list of nested dictionaries.
        """

        return self.collection(collectiond)['items']['itementity']
    
[docs]    def list_item_ids(self, collection):
        """
        Returns a list of item IDs for a given collection.
        
        Parameters
        ----------
        collection : string or int
            Collection id.
            
        Returns
        -------
        list : a list of item ids.
        """
        
        return [ i['id'] for i in self.items(collection) ]
    
[docs]    def item(self, item):
        """
        Retrieve an item by id.
        
        Parameters
        ----------
        item : string or int
            An item id.
        
        Returns
        -------
        dict : a nested dictionary.
        """
    
        path = '/items/' + str(item) + '.xml'
        root = self.get_element_from_resource(path)
        return self.dict_from_node(root, True)

[docs]    def item_metadata(self, item):
        """
        Returs metadata for an item as a simple dictionary, with dc fields
        as keys.
        
        Parameters
        ----------
        item : string or int
            An item id.
            
        Returns
        -------
        dict : metadata, with dc fields as keys.
        """
        
        i = self.item(item)
        return { me['element'] + '.' + me['qualifier']:me['value'] for me \
                  in i['metadata']['metadataentity'] }

[docs]    def all_collections(self):
        """
        Retrieves details about all of the collections to which a user has
        access.
        
        Returns
        -------
        list : a list of nested dictionaries.
        """
        
        root = self.get_element_from_resource('/collections.xml')
        return [ self.dict_from_node(node) for node in root ]
    
[docs]    def list_bitstream_ids(self, item):
        """
        Returns a list of bitstream ids for an item.

        Parameters
        ----------
        item : string or int
            An item id.
            
        Returns
        -------        
        list : a list of bitstream ids.
        """
        
        i = self.item(item)
        if type(i['bitstreams']['bitstreamentity']) is dict:    # One bitstream.
            return [ i['bitstreams']['bitstreamentity']['id'] ]
        else:
            return [ be['id'] for be in i['bitstreams']['bitstreamentity'] ]
    
[docs]    def bitstream(self, bitstream):
        """
        Returns information about a bitstream.

        Parameters
        ----------
        item : string or int
            An item id.
            
        Returns
        -------
        dict : a nested dictionary.          
        """
        
        path = '/bitstream/' + str(bitstream) + '.xml'
        root = self.get_element_from_resource()
        bitstreamentities = root.findall('.//bitstreamentity')
        for b in bitstreamentities:
            if self.dict_from_node(b)['id'] == str(bitstream):
                return self.dict_from_node(b, True)
    
[docs]    def get_bitstream(self, bitstream, save_path=None):
        """
        Downloads a bitstream and handles it. If save_path is provided, returns
        a file pointer. Otherwise returns the content of the bitstream.
        
        Parameters
        ----------
        bitstream : string or int
            A bitstream id.
        save_path : string or None
            Full path where bitstream should be saved, including the filename.
        
        Returns
        -------
        Contents of bitstream, or file pointer.
        
        Notes
        -----
        WARNING: This has only been tested on bitstreams containing text data!
        
        TODO
        ----
        More robust handling for different data types.
        """

        rpath = self.get_path('/bitstream/' + str(bitstream))
        r = urllib2.urlopen(rpath)
        data = r.read()
        if save_path is None:
            return data
        else:
            with open(save_path, 'w') as f:
                f.write(data)
                return f
Navigation

Source code for dspace

Quick search

Navigation