Module monodikit.models.corpus

Expand source code
import glob
import random
import os

from .document import Chant
from .source import create_source
import pickle


class Corpus:
    """
    A collection of chants loaded from a directory.

    Parameters:
    -----------
    directory : str
        The input directory that holds the data.

    sample : int, optional
        The number of chants that are given as a sample. Defaults to None.

    filter_options : dict or callable, optional
        If a dict is provided, it does a substring match.

        If a callable object is provided, it calls it for every document and creates the document
        if True is returned.

    Returns:
    --------
    Corpus
        A Corpus object containing the chants loaded from the directory.

    Examples:
    ---------
    Load chants with a specific substring match:
    ```
    subcorpus1 = Corpus("../data/*", filter_options={"gattung1": "Sequenz"})
    ```

    Load chants using a custom filtering function:
    ```
    subcorpus2 = Corpus("../data/*", filter_options=lambda chant: chant.gattung1 == "Tropus" or chant.gattung1 == "Sequenz")
    ```
    """

    def __init__(self, directory, sample=0, filters=None):
        if filters is None:
            filters = {}
        self.directory = directory
        self.filters = filters
        self.documents = []
        self.sample = False
        sources = [create_source(source) for source in glob.glob(self.directory)]
        self.sources = {source.sigle: source for source in sources}
        self.load_corpus(sample)

    # TODO: One should create subcorpora without the need to reload the whole corpus

    def __init__(self, directory=None, sample=0, filters=None, use_pkl=None):
        if use_pkl and os.path.exists(use_pkl):
            print("pkl exists, use pkl..")
            with open(use_pkl, "rb") as f:
                corpus_data = pickle.load(f)
                self.directory = corpus_data.directory
                self.filters = corpus_data.filters
                self.documents = corpus_data.documents
                self.sample = corpus_data.sample
                self.sources = corpus_data.sources

        elif directory:
            print("pkl does not exist.")

            self.directory = directory
            self.filters = filters
            self.documents = []
            self.sample = False
            sources = [create_source(source) for source in glob.glob(self.directory)]
            self.sources = {source.sigle: source for source in sources}
            self.load_corpus(sample)
            if use_pkl:
                print("save pkl.")
                with open(use_pkl, "wb") as f:
                    pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
        print("end __init__")
            # TODO: One should create subcorpora without the need to reload the whole corpus

    def load_corpus(self, sample):
        if not sample:
            self.sample = False
            self.documents = list(filter(None, [create_document(entry,
                                                                filters=self.filters,
                                                                sources=self.sources)
                                                for source in glob.glob(self.directory)
                                                for entry in
                                                [directory for directory in
                                                 glob.glob(os.path.join(source, "*")) if
                                                 os.path.isdir(directory)]
                                                if check_files_exist(entry)]))  # ! Error when there are missing files?
        else:
            self.sample = True
            entries = [entry for source in glob.glob(self.directory)
                       for entry in
                       [directory for directory in glob.glob(os.path.join(source, "*"))
                        if os.path.isdir(directory)]]
            return list(filter(None, [create_document(entry, filters=self.filters)
                                      for entry in random.sample(entries, sample)
                                      if check_files_exist(entry)]))

    def filter(self, filters):
        if not callable(filters):
            Exception("Error  in filter(): argument has to be a callable")
        if self.filters:
            filters = lambda document_meta, source_meta: filters(document_meta, source_meta) and self.filters(document_meta, source_meta)
        filtered_corpus = Corpus()
        filtered_corpus.documents = [document for document in self.documents if filters(document.meta, self.sources[document.meta.source_id])]
        filtered_corpus.sources = self.sources
        filtered_corpus.filters = filters
        filtered_corpus.sample = self.sample
        return filtered_corpus



    @property
    def document_metadata(self):
        return [document.meta.as_record for document in self.documents]

    @property
    def source_metadata(self):
        return [source.meta.as_record for source in self.sources]


from .genre_specific import ProperTropeComplex


def create_document(entry, filters = None, sources=None):

    """
    Creates a new instance of a document.
    At first checks for a specific type and assigns a genre-specific subclass
    (for example TropeComplex()).
     If no suitable subclass is found, it returns a generic Document() instance.
     Example:
         ```
         document = create_document("./data/manuscriptid/documentid")
         ```
    """
    document_meta = Chant.get_meta(entry)
    if sources:
        source_meta = sources.get(document_meta.source_id, None)
    if filters:
        if callable(filters):
            if not filters(document_meta, source_meta):
                return None
        else:
            Exception("Filters have to be a callable")
   # else:
        # Now checks only for substring
       # for key, value in filters.items():
        #    if value not in document_meta.__getattribute__(key):
         #       return None

    if document_meta.genre == "Tropus":
        return ProperTropeComplex(entry)
    # ... other genres
    else:
        return Chant(entry)


def check_files_exist(path):
    meta_file = os.path.join(path, 'meta.json')
    data_file = os.path.join(path, 'data.json')

    if os.path.isfile(meta_file) and os.path.isfile(data_file):
        return True
    else:
        return False

Functions

def check_files_exist(path)
Expand source code
def check_files_exist(path):
    meta_file = os.path.join(path, 'meta.json')
    data_file = os.path.join(path, 'data.json')

    if os.path.isfile(meta_file) and os.path.isfile(data_file):
        return True
    else:
        return False
def create_document(entry, filters=None, sources=None)

Creates a new instance of a document. At first checks for a specific type and assigns a genre-specific subclass (for example TropeComplex()). If no suitable subclass is found, it returns a generic Document() instance. Example: document = create_document("./data/manuscriptid/documentid")

Expand source code
def create_document(entry, filters = None, sources=None):

    """
    Creates a new instance of a document.
    At first checks for a specific type and assigns a genre-specific subclass
    (for example TropeComplex()).
     If no suitable subclass is found, it returns a generic Document() instance.
     Example:
         ```
         document = create_document("./data/manuscriptid/documentid")
         ```
    """
    document_meta = Chant.get_meta(entry)
    if sources:
        source_meta = sources.get(document_meta.source_id, None)
    if filters:
        if callable(filters):
            if not filters(document_meta, source_meta):
                return None
        else:
            Exception("Filters have to be a callable")
   # else:
        # Now checks only for substring
       # for key, value in filters.items():
        #    if value not in document_meta.__getattribute__(key):
         #       return None

    if document_meta.genre == "Tropus":
        return ProperTropeComplex(entry)
    # ... other genres
    else:
        return Chant(entry)

Classes

class Corpus (directory=None, sample=0, filters=None, use_pkl=None)

A collection of chants loaded from a directory.

Parameters:

directory : str The input directory that holds the data.

sample : int, optional The number of chants that are given as a sample. Defaults to None.

filter_options : dict or callable, optional If a dict is provided, it does a substring match.

If a callable object is provided, it calls it for every document and creates the document
if True is returned.

Returns:

Corpus A Corpus object containing the chants loaded from the directory.

Examples:

Load chants with a specific substring match:

subcorpus1 = Corpus("../data/*", filter_options={"gattung1": "Sequenz"})

Load chants using a custom filtering function:

subcorpus2 = Corpus("../data/*", filter_options=lambda chant: chant.gattung1 == "Tropus" or chant.gattung1 == "Sequenz")
Expand source code
class Corpus:
    """
    A collection of chants loaded from a directory.

    Parameters:
    -----------
    directory : str
        The input directory that holds the data.

    sample : int, optional
        The number of chants that are given as a sample. Defaults to None.

    filter_options : dict or callable, optional
        If a dict is provided, it does a substring match.

        If a callable object is provided, it calls it for every document and creates the document
        if True is returned.

    Returns:
    --------
    Corpus
        A Corpus object containing the chants loaded from the directory.

    Examples:
    ---------
    Load chants with a specific substring match:
    ```
    subcorpus1 = Corpus("../data/*", filter_options={"gattung1": "Sequenz"})
    ```

    Load chants using a custom filtering function:
    ```
    subcorpus2 = Corpus("../data/*", filter_options=lambda chant: chant.gattung1 == "Tropus" or chant.gattung1 == "Sequenz")
    ```
    """

    def __init__(self, directory, sample=0, filters=None):
        if filters is None:
            filters = {}
        self.directory = directory
        self.filters = filters
        self.documents = []
        self.sample = False
        sources = [create_source(source) for source in glob.glob(self.directory)]
        self.sources = {source.sigle: source for source in sources}
        self.load_corpus(sample)

    # TODO: One should create subcorpora without the need to reload the whole corpus

    def __init__(self, directory=None, sample=0, filters=None, use_pkl=None):
        if use_pkl and os.path.exists(use_pkl):
            print("pkl exists, use pkl..")
            with open(use_pkl, "rb") as f:
                corpus_data = pickle.load(f)
                self.directory = corpus_data.directory
                self.filters = corpus_data.filters
                self.documents = corpus_data.documents
                self.sample = corpus_data.sample
                self.sources = corpus_data.sources

        elif directory:
            print("pkl does not exist.")

            self.directory = directory
            self.filters = filters
            self.documents = []
            self.sample = False
            sources = [create_source(source) for source in glob.glob(self.directory)]
            self.sources = {source.sigle: source for source in sources}
            self.load_corpus(sample)
            if use_pkl:
                print("save pkl.")
                with open(use_pkl, "wb") as f:
                    pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
        print("end __init__")
            # TODO: One should create subcorpora without the need to reload the whole corpus

    def load_corpus(self, sample):
        if not sample:
            self.sample = False
            self.documents = list(filter(None, [create_document(entry,
                                                                filters=self.filters,
                                                                sources=self.sources)
                                                for source in glob.glob(self.directory)
                                                for entry in
                                                [directory for directory in
                                                 glob.glob(os.path.join(source, "*")) if
                                                 os.path.isdir(directory)]
                                                if check_files_exist(entry)]))  # ! Error when there are missing files?
        else:
            self.sample = True
            entries = [entry for source in glob.glob(self.directory)
                       for entry in
                       [directory for directory in glob.glob(os.path.join(source, "*"))
                        if os.path.isdir(directory)]]
            return list(filter(None, [create_document(entry, filters=self.filters)
                                      for entry in random.sample(entries, sample)
                                      if check_files_exist(entry)]))

    def filter(self, filters):
        if not callable(filters):
            Exception("Error  in filter(): argument has to be a callable")
        if self.filters:
            filters = lambda document_meta, source_meta: filters(document_meta, source_meta) and self.filters(document_meta, source_meta)
        filtered_corpus = Corpus()
        filtered_corpus.documents = [document for document in self.documents if filters(document.meta, self.sources[document.meta.source_id])]
        filtered_corpus.sources = self.sources
        filtered_corpus.filters = filters
        filtered_corpus.sample = self.sample
        return filtered_corpus



    @property
    def document_metadata(self):
        return [document.meta.as_record for document in self.documents]

    @property
    def source_metadata(self):
        return [source.meta.as_record for source in self.sources]

Subclasses

Instance variables

var document_metadata
Expand source code
@property
def document_metadata(self):
    return [document.meta.as_record for document in self.documents]
var source_metadata
Expand source code
@property
def source_metadata(self):
    return [source.meta.as_record for source in self.sources]

Methods

def filter(self, filters)
Expand source code
def filter(self, filters):
    if not callable(filters):
        Exception("Error  in filter(): argument has to be a callable")
    if self.filters:
        filters = lambda document_meta, source_meta: filters(document_meta, source_meta) and self.filters(document_meta, source_meta)
    filtered_corpus = Corpus()
    filtered_corpus.documents = [document for document in self.documents if filters(document.meta, self.sources[document.meta.source_id])]
    filtered_corpus.sources = self.sources
    filtered_corpus.filters = filters
    filtered_corpus.sample = self.sample
    return filtered_corpus
def load_corpus(self, sample)
Expand source code
def load_corpus(self, sample):
    if not sample:
        self.sample = False
        self.documents = list(filter(None, [create_document(entry,
                                                            filters=self.filters,
                                                            sources=self.sources)
                                            for source in glob.glob(self.directory)
                                            for entry in
                                            [directory for directory in
                                             glob.glob(os.path.join(source, "*")) if
                                             os.path.isdir(directory)]
                                            if check_files_exist(entry)]))  # ! Error when there are missing files?
    else:
        self.sample = True
        entries = [entry for source in glob.glob(self.directory)
                   for entry in
                   [directory for directory in glob.glob(os.path.join(source, "*"))
                    if os.path.isdir(directory)]]
        return list(filter(None, [create_document(entry, filters=self.filters)
                                  for entry in random.sample(entries, sample)
                                  if check_files_exist(entry)]))