Browse Source

Upload files to 'labels/crowd_labeling'

Luca Pion-Tonachini 5 years ago
parent
commit
0d0083d380

+ 525 - 0
labels/crowd_labeling/CLLDA.py

@@ -0,0 +1,525 @@
+import numpy as np
+try:
+    from scipy.stats import mode
+except ImportError:
+    def mode(a, axis=0):
+        scores = np.unique(np.ravel(a))  # get ALL unique values
+        testshape = list(a.shape)
+        testshape[axis] = 1
+        oldmostfreq = np.zeros(testshape)
+        oldcounts = np.zeros(testshape)
+
+        for score in scores:
+            template = (a == score)
+            counts = np.expand_dims(np.sum(template, axis), axis)
+            mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
+            oldcounts = np.maximum(counts, oldcounts)
+            oldmostfreq = mostfrequent
+
+        return mostfrequent, oldcounts
+import multiprocessing as mp
+from time import time
+from copy import deepcopy
+from crowd_labeling.logratio_transformations import \
+    centered_log_ratio_transform as clr, \
+    isometric_log_ratio_transform as ilr, \
+    additive_log_ratio_transform as alr, \
+    make_projection_matrix as mpm
+
+
+class CLLDA:
+    """
+    The :class: CLLDA is a python implementation of Crowd Labeling Latent Dirichlet Allocation.
+
+    This algorithm processes crowd labeling (aka crowd consensus) data where workers label instances
+    as pertaining to one or more classes. Allows for calculating resulting label estimates and
+    covariances in multiple log-ratio transformed spaces.
+    """
+
+    def __init__(self, votes, workers, instances, vote_ids=None, worker_ids=None, instance_ids=None,
+                 worker_prior=None, instance_prior=None, transform=None,
+                 num_epochs=1000, burn_in=200, updateable=True, save_samples=False, seed=None):
+
+        """
+        Initializes settings for the model and automatically calls the inference function.
+
+        :param votes: List of vote values.
+        :param workers: List of uses who submitted :param votes.
+        :param instances: List of instances to which the :param votes pertain.
+        :param vote_ids: (optional) List of vote ids. If provided, :param votes should be a list of integers.
+        :param instance_ids: (optional) List of instance ids. If provided, :param instances should be a list of integers.
+        :param worker_ids: (optional) List of worker ids. If provided, :param workers should be a list of integers.
+        :param worker_prior: (optional) Matrix prior for worker skill (pseudovotes).
+        :param instance_prior: (optional) List of class priors (pseudovotes)
+        :param transform: log-ratio transform to use.
+        :param num_epochs: number of epochs to run for.
+        :param burn_in: number of epochs to ignore for convergence of the Gibbs chain.
+        :param updateable: If True, will save vote-classes between runs at the expense of memory.
+        :param save_samples: option to save vote-classes after each epoch (very memory intensive).
+        :param seed: seed for the random number generator if reproducibility is desired.
+        """
+
+        # set random seed
+        if seed is None:
+            seed = np.random.randint(int(1e8))
+        self.rng = np.random.RandomState(seed=seed)
+
+        # data info and priors
+        self.V = len(votes)
+        self.U = len(np.unique(workers))
+        self.I = len(np.unique(instances))
+        if worker_prior is not None:
+            self.worker_prior = np.array(worker_prior)
+            if self.worker_prior.ndim == 2:
+                self.worker_prior = np.tile(self.worker_prior[np.newaxis, :, :], [self.U, 1, 1])
+            self.C = self.worker_prior.shape[1]
+            self.R = self.worker_prior.shape[2]
+        else:
+            if vote_ids is not None:
+                self.C = len(vote_ids)
+                self.R = self.C
+            else:
+                self.R = len(np.unique(votes))
+                self.C = self.R
+            self.worker_prior = (np.eye(self.R) + np.ones((self.R, self.R)) / self.R) * 3
+            self.worker_prior = np.tile(self.worker_prior[np.newaxis, :, :], [self.U, 1, 1])
+        if instance_prior is None:
+            self.instance_prior = np.ones(self.C) / self.C / 4
+        else:
+            self.instance_prior = instance_prior
+
+        # determine vote IDs
+        if vote_ids is None:
+            self.vote_ids = np.unique(votes)
+            vote_dict = {y: x for x, y in enumerate(self.vote_ids)}
+            votes = np.array([vote_dict[x] for x in votes])
+        else:
+            self.vote_ids = vote_ids
+        # determine instance IDs
+        if instance_ids is None:
+            self.instance_ids = np.unique(instances)
+            instance_dict = {y: x for x, y in enumerate(self.instance_ids)}
+            instances = np.array([instance_dict[x] for x in instances])
+        else:
+            self.instance_ids = instance_ids
+        # determine worker IDs
+        if worker_ids is None:
+            self.worker_ids = np.unique(workers)
+            worker_dict = {y: x for x, y in enumerate(self.worker_ids)}
+            workers = np.array([worker_dict[x] for x in workers])
+        else:
+            self.worker_ids = worker_ids
+
+        # cl_transform info
+        if not isinstance(transform, str) and hasattr(transform, '__iter__'):
+            self.transform = tuple(transform)
+        else:
+            self.transform = (transform,)
+
+        # Gibbs sampling parameters
+        self.num_epochs = num_epochs
+        self.burn_in = burn_in
+        self.num_samples = num_epochs - burn_in
+
+        # info to save
+        self.LL = np.nan * np.ones(self.num_epochs)
+        self.worker_mats = np.zeros((self.U, self.C, self.R))
+        self.labels, self.labels_cov = list(), list()
+        for transform in self.transform:
+            if transform in (None, 'none', 'clr'):
+                self.labels.append(np.zeros((self.I, self.C)))
+                self.labels_cov.append(np.zeros((self.I, self.C, self.C)))
+            elif transform in ('alr', 'ilr'):
+                self.labels.append(np.zeros((self.I, self.C - 1)))
+                self.labels_cov.append(np.zeros((self.I, self.C - 1, self.C - 1)))
+            else:
+                raise Exception('Unknown transform!')
+        if save_samples:
+            self.samples = np.zeros((self.num_epochs - self.burn_in, self.I, self.C - 1))
+        self.updateable = updateable
+        self.vote_classes = None
+
+        # estimate label means and covariances using cllda
+        self.cllda(votes, workers, instances)
+
+        # clean up
+        if not self.updateable:
+            self.vote_classes = None
+        else:
+            self.votes = votes
+            self.instances = instances
+            self.workers = workers
+
+    # CLLDA optimization using Gibbs sampling
+    def cllda(self, votes, workers, instances, starting_epoch=0):
+        """
+        Performs inference on the :class: CLLDA model.
+
+        :param votes: List of vote values.
+        :param workers: List of workers who submitted :param votes.
+        :param instances: List of instances to which the :param votes pertain.
+        :param starting_epoch: How many epochs have already been incorporated in the averages.
+        """
+
+        # precalculate
+        worker_prior_sum = self.worker_prior.sum(axis=2)
+        instance_prior_sum = self.instance_prior.sum()
+
+        # initial estimates
+        if self.vote_classes is None:
+            if self.C == self.R:
+                self.vote_classes = votes.copy()
+            else:
+                self.vote_classes = self.rng.randint(0, self.C, self.V)
+
+        # calculate vote weights
+        temp = np.vstack((workers, instances)).T
+        temp = np.ascontiguousarray(temp).view(np.dtype((np.void, temp.dtype.itemsize * temp.shape[1])))
+        _, unique_counts = np.unique(temp, return_counts=True)
+        weights = 1. / unique_counts[instances]  # type: np.ndarray
+
+        # initial counts
+        counts_across_images = np.zeros(shape=(self.U, self.C, self.R))
+        counts_across_workers_and_votes = np.zeros(shape=(self.I, self.C))
+        for it_v in range(self.V):
+            counts_across_images[workers[it_v], self.vote_classes[it_v], votes[it_v]] += weights[it_v]
+            counts_across_workers_and_votes[instances[it_v], self.vote_classes[it_v]] += weights[it_v]
+        counts_across_images_and_votes = counts_across_images.sum(axis=2)
+
+        # set cl_transform
+        transform = list()
+        for tfm in self.transform:
+            if tfm in (None, 'none'):
+                transform.append(self.identity)
+            elif tfm == 'clr':
+                transform.append(clr)
+            elif tfm == 'alr':
+                transform.append(alr)
+            elif tfm == 'ilr':
+                transform.append(lambda comp: ilr(comp, mpm(self.C)))
+
+        # LDA functions
+        def get_data_like():
+            like = np.zeros(self.V)
+            for it_v in range(self.V):
+                i = instances[it_v]
+                k = self.vote_classes[it_v]
+                u = workers[it_v]
+                v = votes[it_v]
+                w = weights[it_v]  # type: np.ndarray
+                like[it_v] = (counts_across_workers_and_votes[i, k] - w + self.instance_prior[k]) \
+                             * (counts_across_images[u, k, v] - w + self.worker_prior[u, k, v]) \
+                             / (counts_across_images_and_votes[u, k] - w + worker_prior_sum[u, k])
+            return np.log(like).sum()
+
+        def get_label_prob():
+            like = (counts_across_workers_and_votes[i, :] + self.instance_prior[:]) \
+                   * (counts_across_images[u, :, v] + self.worker_prior[u, :, v]) \
+                   / (counts_across_images_and_votes[u, :] + worker_prior_sum[u, :])
+            return like / like.sum()
+
+        def update_labels():
+            # create update
+            numerator = counts_across_workers_and_votes + self.instance_prior
+            denominator = counts_across_workers_and_votes.sum(axis=1) + instance_prior_sum
+            update = numerator / denominator[:, np.newaxis]
+            for it, tfm in enumerate(transform):
+                tfmupdate = tfm(update)
+                if hasattr(self, 'samples'):
+                    self.samples[ep - self.burn_in, :, :] = tfmupdate
+                # update labels
+                delta = (tfmupdate - self.labels[it]) / (ep - self.burn_in + 1)
+                self.labels[it] += delta
+                # update labels_M2
+                delta_cov = delta[:, :, np.newaxis] * delta[:, :, np.newaxis].transpose(0, 2, 1)
+                self.labels_cov[it] += (ep - self.burn_in) * delta_cov - self.labels_cov[it] / (ep - self.burn_in + 1)
+
+        def update_worker_mats():
+            # create update
+            numerator = counts_across_images + self.worker_prior
+            denominator = counts_across_images.sum(axis=2) + worker_prior_sum
+            update = numerator / denominator[:, :, np.newaxis]
+            # update labels
+            delta = (update - self.worker_mats) / (ep - self.burn_in + 1)
+            self.worker_mats += delta
+
+        # CLLDA
+        start = time()
+        for ep in range(starting_epoch, starting_epoch + self.num_epochs):
+            # begin epoch
+            print('starting epoch ' + str(ep + 1))
+            if ep > starting_epoch:
+                time_to_go = (time() - start) * (self.num_epochs - ep) / ep
+                if time_to_go >= 3600:
+                    print('Estimated time to finish: %.2f hours' % (time_to_go / 3600,))
+                elif time_to_go >= 60:
+                    print('Estimated time to finish: %.1f minutes' % (time_to_go / 60,))
+                else:
+                    print('Estimated time to finish: %.1f seconds' % (time_to_go,))
+            ep_start = time()
+
+            # gibbs sampling
+            for it_v in self.rng.permutation(self.V).astype(np.int64):
+                # get correct indices
+                i = instances[it_v]
+                k = self.vote_classes[it_v]
+                u = workers[it_v]
+                v = votes[it_v]
+                w = weights[it_v]
+                # decrement counts
+                counts_across_images[u, k, v] -= w
+                counts_across_workers_and_votes[i, k] -= w
+                counts_across_images_and_votes[u, k] -= w
+                # calculate probabilities of labels for this vote
+                probs = get_label_prob()
+                # sample new label
+                k = self.rng.multinomial(1, probs).argmax()
+                self.vote_classes[it_v] = k
+                # increment counts
+                counts_across_images[u, k, v] += w
+                counts_across_workers_and_votes[i, k] += w
+                counts_across_images_and_votes[u, k] += w
+
+            # save information
+            self.LL[ep] = get_data_like()
+            if ep >= self.burn_in + starting_epoch:
+                update_labels()
+                update_worker_mats()
+
+            # print epoch LL and duration
+            print('Epoch completed in %.1f seconds' % (time() - ep_start,))
+            print('LL: %.6f' % (self.LL[ep]))
+
+        # adjust label covariances
+        self.labels_cov = [x * self.num_samples / (self.num_samples - 1.) for x in self.labels_cov]
+
+        time_total = time() - start
+        if time_total >= 3600:
+            print('CLLDA completed in %.2f hours' % (time_total / 3600,))
+        elif time_total >= 60:
+            print('CLLDA completed in %.1f minutes' % (time_total / 60,))
+        else:
+            print('CLLDA completed in %.1f seconds' % (time_total,))
+
+    #
+    def update(self, votes, workers, instances, vote_ids=None, instance_ids=None, worker_ids=None, worker_prior=None,
+               num_epochs=1000, burn_in=200):
+
+        # check that this is updateble
+        assert self.updateable, 'This model is not updateable, presumable to conserve memory.'
+
+        # determine IDs
+        # for votes
+        old_vote_ids = self.vote_ids.copy()  # type: np.ndarray
+        if vote_ids is None:
+            self.vote_ids = np.unique(votes)
+            vote_dict = {y: x for x, y in enumerate(self.vote_ids)}
+            votes = np.array([vote_dict[x] for x in votes])
+        else:
+            self.vote_ids = vote_ids
+        # for instances
+        old_instance_ids = self.instance_ids.copy()  # type: np.ndarray
+        if instance_ids is None:
+            self.instance_ids = np.unique(instances)
+            instance_dict = {y: x for x, y in enumerate(self.instance_ids)}
+            instances = np.array([instance_dict[x] for x in instances])
+        else:
+            self.instance_ids = instance_ids
+        # for workers
+        old_worker_ids = self.worker_ids.copy()  # type: np.ndarray
+        if worker_ids is None:
+            self.worker_ids = np.unique(workers)
+            worker_dict = {y: x for x, y in enumerate(self.worker_ids)}
+            workers = np.array([worker_dict[x] for x in workers])
+        else:
+            self.worker_ids = worker_ids
+
+        # update parameters
+        self.V = len(votes)
+        self.U = len(np.unique(workers))
+        self.I = len(np.unique(instances))
+        self.num_epochs = num_epochs
+        self.burn_in = burn_in
+
+        # add more samples to previous solution
+        if np.array_equal(votes, self.votes) and np.array_equal(workers, self.workers) \
+                and np.array_equal(instances, self.instances) and np.array_equal(self.vote_ids, old_vote_ids) \
+                and np.array_equal(self.instance_ids, old_instance_ids) \
+                and np.array_equal(self.worker_ids, old_worker_ids):
+            # adjust label covariances
+            self.labels_cov = [x * (self.num_samples - 1.) / self.num_samples for x in self.labels_cov]
+
+            # update parameters
+            self.LL = np.concatenate((self.LL, np.zeros(num_epochs)))
+            old_num_samples = self.num_samples
+            self.num_samples += num_epochs - burn_in
+            self.votes = votes
+            self.workers = workers
+            self.instances = instances
+
+            # update cllda
+            self.cllda(votes, workers, instances, old_num_samples - 1)
+
+        # keep only vote-classes and build off of them
+        else:
+            # insert old vote-classes and initialize new vote-classes
+            old_vote_classes = self.vote_classes.copy()
+            self.vote_classes = np.zeros_like(votes)
+            old_dict = {y: x for x, y in enumerate(zip(self.votes, self.workers, self.instances))}
+            for it, index in enumerate(zip(votes, workers, instances)):
+                try:
+                    self.vote_classes[it] = old_vote_classes[old_dict[index]]
+                except KeyError:
+                    if self.C == self.R:
+                        self.vote_classes[it] = votes[it]
+                    else:
+                        self.vote_classes[it] = self.rng.randint(self.C)
+
+            # adjust worker_prior if necessary
+            if not np.array_equal(self.worker_ids, old_worker_ids):
+                assert worker_prior is not None, "Worker priors must be provided if worker_ids change."
+                self.worker_prior = np.array(worker_prior)
+                if self.worker_prior.ndim == 2:
+                    self.worker_prior = np.tile(self.worker_prior[np.newaxis, :, :], [self.U, 1, 1])
+
+            # adjust info to save
+            self.worker_mats = np.zeros((self.U, self.C, self.R))
+            self.labels, self.labels_cov = list(), list()
+            for transform in self.transform:
+                if transform in (None, 'none', 'clr'):
+                    self.labels.append(np.zeros((self.I, self.C)))
+                    self.labels_cov.append(np.zeros((self.I, self.C, self.C)))
+                elif transform in ('alr', 'ilr'):
+                    self.labels.append(np.zeros((self.I, self.C - 1)))
+                    self.labels_cov.append(np.zeros((self.I, self.C - 1, self.C - 1)))
+                else:
+                    raise Exception('Unknown transform!')
+
+            # update parameters
+            self.LL = np.zeros(num_epochs)
+            self.num_samples = num_epochs
+
+            # update cllda
+            self.cllda(votes, workers, instances)
+            self.votes = votes
+            self.instances = instances
+            self.workers = workers
+
+    # no cl_transform
+    @staticmethod
+    def identity(compositional):
+        return compositional
+
+
+def concurrent_cllda(models, votes, workers, instances, nprocs=4, **kwargs):
+    """
+    Effortless parallelization of multiple CLLDA models.
+
+    :param models: If creating new models, an integer denoting how many models to create.
+        Otherwise, a list of existing models to update.
+    :param votes: List of vote values.
+    :param workers: List of uses who submitted :param votes.
+    :param instances: List of instances to which the :param votes pertain.
+    :param nprocs: Number of processors to use in the parallel pool.
+    :param kwargs: Other possible inputs to either CLLDA.__init__ or CLLDA.update
+    :return: List of new or updated CLLDA models.
+    """
+
+    # open parallel pool
+    print('Starting multiprocessing pool...')
+    pool = mp.Pool(processes=nprocs)
+
+    # run CL-LDA
+    if isinstance(models, int):
+        print('Starting new CL-LDA models in parallel...')
+        if 'seed' in kwargs.keys():
+            np.random.seed(kwargs['seed'])
+        kwargs = [deepcopy(kwargs) for x in range(models)]
+        for it in range(models):
+            kwargs[it]['seed'] = np.random.randint(int(1e8))
+        out = pool.map(_new_cllda, [(votes, workers, instances, kwa) for kwa in kwargs])
+
+    elif hasattr(models, '__iter__'):
+        print('Updating CL-LDA models in parallel...')
+        out = pool.map(_update_cllda, [(model, votes, workers, instances, kwargs) for model in models])
+
+    else:
+        pool.close()
+        TypeError('Unknown type for input: models.')
+
+    # close parallel pool
+    pool.close()
+    print('Multiprocessing pool closed.')
+
+    return out
+
+
+def combine_cllda(models):
+    """
+    Combine multiple CLLDA instances.
+    :param models: List of CLLDA models trained with the same settings.
+    :return: CLLDA model which combines the input models.
+    """
+
+    # check models are equivalent
+    assert np.equal(models[0].V, [model.V for model in models[1:]]).any(), 'Different number of votes!'
+    assert np.equal(models[0].U, [model.U for model in models[1:]]).any(), 'Different number of workers!'
+    assert np.equal(models[0].I, [model.I for model in models[1:]]).any(), 'Different number of instances!'
+    assert np.equal(models[0].C, [model.C for model in models[1:]]).any(), 'Different number of classes!'
+    assert np.equal(models[0].R, [model.R for model in models[1:]]).any(), 'Different number of responses!'
+    assert np.equal(models[0].worker_prior,
+                    [model.worker_prior for model in models[1:]]).any(), 'Different worker priors!'
+    assert np.equal(models[0].instance_prior,
+                    [model.instance_prior for model in models[1:]]).any(), 'Different instance priors!'
+    assert np.all([models[0].transform == model.transform for model in models[1:]]), 'Different transforms!'
+
+    # data info
+    out = deepcopy((models[0]))
+
+    # combine label estimates
+    out.num_samples = np.sum([model.num_samples for model in models])
+
+    # combine worker estimates
+    out.worker_mats = np.sum([model.worker_mats * model.num_samples for model in models],
+                             axis=0) / out.num_samples
+
+    if all([x.updateable for x in models]):
+        out.vote_classes = mode(np.stack([x.vote_classes for x in models]))[0].flatten()
+
+    # combine labels and label covariances
+    for it in range(len(models[0].transform)):
+        out.labels[it] = np.sum([model.labels[it] * model.num_samples for model in models], 0) / out.num_samples
+        labels_corrmat = [(model.num_samples - 1.) / model.num_samples * model.labels_cov[it]
+                          + model.labels[it][..., np.newaxis] * model.labels[it][..., np.newaxis].transpose(0, 2, 1)
+                          for model in models]
+        out.labels_cov[it] = np.sum([corrmat * model.num_samples for model, corrmat in zip(models, labels_corrmat)],
+                                    0) \
+                             / out.num_samples - out.labels[it][..., np.newaxis] * out.labels[it][
+            ..., np.newaxis].transpose(0, 2, 1)
+
+        # adjust label covariances
+        out.labels_cov[it] *= out.num_samples / (out.num_samples - 1.)
+
+    return out
+
+
+# map function
+def _new_cllda(inputs):
+    return CLLDA(*inputs[:3], **inputs[3])
+
+
+# map function
+def _update_cllda(inputs):
+    inputs[0].update(*inputs[1:4], **inputs[4])
+    return inputs[0]
+
+
+# if __name__ == '__main__':
+#     # test suite
+#     from DS import DS
+#     test_data = DS.test_data()
+#     CLLDA(test_data[0], test_data[1], test_data[2], num_epochs=10, burn_in=2, transform=('none', 'alr', 'ilr', 'clr'))
+#     cls = concurrent_cllda(4, test_data[0], test_data[1], test_data[2],
+#                            num_epochs=10, burn_in=2, transform=('none', 'alr', 'ilr', 'clr'))
+#     cl = combine_cllda(cls)
+#     a=1

+ 309 - 0
labels/crowd_labeling/DS.py

@@ -0,0 +1,309 @@
+import numpy as np
+from time import time
+
+
+class DS:
+
+    def __init__(self, votes=None, workers=None, instances=None, test=None,
+                 transform=None, classes=None, num_epochs=1000):
+
+        if votes is None or workers is None or instances is None:
+            votes, workers, instances, test = self.test_data()
+
+        # data info
+        self.V = len(votes)
+        self.U = len(np.unique(workers))
+        self.I = len(np.unique(instances))
+        if classes is not None:
+            self.C = len(classes)
+        else:
+            self.C = len(np.unique(votes))
+        self.transform = transform
+        self.instance_prior = np.zeros(self.C)
+        self.eps = np.finfo(np.float64).eps
+
+        # EM parameters
+        self.max_epochs = num_epochs
+
+        # info to save
+        self.LL = np.nan * np.ones(self.max_epochs)
+        if test is not None:
+            self.accuracy = np.nan * np.ones(self.max_epochs)
+        self.labels = np.zeros((self.I, self.C))
+        # self.labels_cov = np.zeros((self.I, self.C, self.C))
+        self.worker_skill = np.zeros((self.U, self.C, self.C))
+
+        # estimate label means and covariances using ds
+        self.ds(votes, workers, instances, test)
+
+        # apply transform
+        if transform == 'clr':
+
+            def clr(self):
+                continuous = np.log(self.labels + self.eps)
+                continuous -= continuous.mean(1, keepdims=True)
+                return continuous
+
+            self.labels = clr(self)
+
+        elif transform == 'alr':
+
+            def alr(self):
+                continuous = np.log(self.labels[:, :-1] / (self.labels[:, -1] + self.eps))
+                return continuous
+
+            self.labels = alr(self)
+
+        elif transform == 'ilr':
+
+            # make projection matrix
+            self.projectionMatrix = np.zeros((self.C, self.C - 1), dtype=np.float32)
+            for it in range(self.C - 1):
+                i = it + 1
+                self.projectionMatrix[:i, it] = 1. / i
+                self.projectionMatrix[i, it] = -1
+                self.projectionMatrix[i + 1:, it] = 0
+                self.projectionMatrix[:, it] *= np.sqrt(i / (i + 1.))
+
+            def ilr(self):
+                continuous = np.log(self.labels + self.eps)
+                continuous -= continuous.mean(1, keepdims=True)
+                continuous = np.dot(continuous, self.projectionMatrix)
+                return continuous
+
+            self.labels = ilr(self)
+
+
+        # # adjust label covariances and inverses
+        # self.__estimate_label_cov_with_inv_wishart()
+        # self.labels_cov *= (self.num_samples + 1) / self.num_samples
+
+    # EM functions
+    def calculate_log_like(self, worker_ind, vote_ind):
+        # calculate log-likelihood (2.7 is DS)
+        LL = 0
+        for i in range(self.I):
+            LL += np.log((self.worker_skill[worker_ind[i], :, vote_ind[i]].prod(0) * self.instance_prior).sum())
+        LL /= self.I
+        return LL
+
+    # estimate the instance classes given the current parameters (these labels are treated as missing data here for EM)
+    def e_step(self, worker_ind, vote_ind):
+        # estimate instance classes (2.5 is DS)
+        for i in range(self.I):
+            self.labels[i, :] = self.worker_skill[worker_ind[i], :, vote_ind[i]].prod(0)
+        self.labels *= self.instance_prior[np.newaxis, :]
+        self.labels /= self.labels.sum(1, keepdims=True) + self.eps
+
+    # update parameters to maximize the data likelihood
+    def m_step(self, instance_ind):
+        # argmax LL over class probabilities (2.4 is DS)
+        self.instance_prior = self.labels.mean(0) + self.eps
+        # argmax LL over worker skill (2.3 is DS)
+        for u in range(self.U):
+            for c in range(self.C):
+                self.worker_skill[u, :, c] = self.labels[instance_ind[u][c], :].sum(0)
+        self.worker_skill /= self.worker_skill.sum(2, keepdims=True) + self.eps
+
+    # DS optimization using EM
+    def ds(self, votes, workers, instances, test):
+        # precalculate indices
+        print('Generating indices...')
+        worker_ind, vote_ind, instance_ind = [], [], []
+        for i in range(self.I):
+            _instance_ind = instances == i
+            worker_ind.append(workers[_instance_ind])
+            vote_ind.append(votes[_instance_ind])
+        for u in range(self.U):
+            instance_ind.append([])
+            _worker_ind = workers == u
+            for c in range(self.C):
+                _vote_ind = votes == c
+                instance_ind[u].append(instances[np.bitwise_and(_worker_ind, _vote_ind)])
+
+        # DS
+        start = time()
+        for ep in range(self.max_epochs):
+            # begin epoch
+            print('starting epoch ' + str(ep + 1))
+            if ep:
+                time_to_go = (time() - start) * (self.max_epochs - ep) / ep
+                if time_to_go >= 3600:
+                    print('Estimated time to finish: %.2f hours' % (time_to_go / 3600,))
+                elif time_to_go >= 60:
+                    print('Estimated time to finish: %.2f minutes' % (time_to_go / 60,))
+                else:
+                    print('Estimated time to finish: %.1f seconds' % (time_to_go,))
+            ep_start = time()
+
+            # EM
+            print('E step...')
+            if not ep:
+                # initial estimates
+                for i in range(self.I):
+                    ind = instances == i
+                    for c in range(self.C):
+                        self.labels[i, c] = np.count_nonzero(votes[ind] == c)
+                self.labels /= self.labels.sum(1, keepdims=True) + self.eps
+                # self.labels[np.random.choice(self.I, self.I/2, replace=False), :] = 1. / self.C
+            else:
+                self.e_step(worker_ind, vote_ind)
+            print('M step...')
+            self.m_step(instance_ind)
+
+            # save information
+            print('Calculating log-likelihood...')
+            self.LL[ep] = self.calculate_log_like(worker_ind, vote_ind)
+            print('Log-likelihood = %f' % (self.LL[ep],))
+
+            # evaulation if test available
+            if test is not None:
+                self.accuracy[ep] = (self.labels.argmax(1) == test).mean()
+                print('Accuracy = %f' % (self.accuracy[ep],))
+                # ce = -np.log(self.labels[range(self.I), test] + self.eps).sum()
+                # print 'Cross Entropy = %f' % (ce,)
+
+            # print epoch duration
+            print('Epoch completed in %.1f seconds' % (time() - ep_start,))
+
+        time_total = time() - start
+        if time_total >= 3600:
+            print('DS completed in %.2f hours' % (time_total / 3600,))
+        elif time_total >= 60:
+            print('DS completed in %.2f minutes' % (time_total / 60,))
+        else:
+            print('DS completed in %.1f seconds' % (time_total,))
+
+    # # generate covariance estimates using inverse Wishart prior
+    # def __estimate_label_cov_with_inv_wishart(self):
+    #     # prepare parameters
+    #     self.inv_wishart_prior_scatter = 0.1 * np.eye(self.C - 1) * self.num_samples
+    #     self.inv_wishart_degrees_of_freedom = self.C - 1
+    #     scatter_matrix = self.labels_cov * self.num_samples
+    #
+    #     # calculate multivariate student-t covariance based on normal with known mean and inverse Wishart prior
+    #     self.labels_cov_iwp = (self.inv_wishart_prior_scatter + scatter_matrix) \
+    #                           / (self.inv_wishart_degrees_of_freedom + self.num_samples - (self.C - 1) - 1)
+    #
+    #     # calculate covariance inverses for later use
+    #     self.labels_icov_iwp = np.linalg.inv(self.labels_cov_iwp )
+
+    @staticmethod
+    def test_data():
+        """
+        Sample data from the Dawid & Skene (1979) paper
+        :return: (votes, workers, instances, true_class)
+        """
+        # data from DS section 4
+        data = [[[1, 1, 1], 1, 1, 1, 1],
+                [[3, 3, 3], 4, 3, 3, 4],
+                [[1, 1, 2], 2, 1, 2, 2],
+                [[2, 2, 2], 3, 1, 2, 1],
+                [[2, 2, 2], 3, 2, 2, 2],
+                [[2, 2, 2], 3, 3, 2, 2],
+                [[1, 2, 2], 2, 1, 1, 1],
+                [[3, 3, 3], 3, 4, 3, 3],
+                [[2, 2, 2], 2, 2, 2, 3],
+                [[2, 3, 2], 2, 2, 2, 3],
+                [[4, 4, 4], 4, 4, 4, 4],
+                [[2, 2, 2], 3, 3, 4, 3],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[2, 2, 2], 3, 2, 1, 2],
+                [[1, 2, 1], 1, 1, 1, 1],
+                [[1, 1, 1], 2, 1, 1, 1],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[2, 2, 2], 2, 2, 2, 1],
+                [[2, 2, 2], 1, 3, 2, 2],
+                [[2, 2, 2], 2, 2, 2, 2],
+                [[2, 2, 2], 2, 2, 2, 1],
+                [[2, 2, 2], 3, 2, 2, 2],
+                [[2, 2, 1], 2, 2, 2, 2],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[2, 3, 2], 2, 2, 2, 2],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[1, 1, 2], 1, 1, 2, 1],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[3, 3, 3], 3, 2, 3, 3],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[2, 2, 2], 2, 2, 2, 2],
+                [[2, 2, 2], 3, 2, 3, 2],
+                [[4, 3, 3], 4, 3, 4, 3],
+                [[2, 2, 1], 2, 2, 3, 2],
+                [[2, 3, 2], 3, 2, 3, 3],
+                [[3, 3, 3], 3, 4, 3, 2],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[1, 1, 1], 1, 1, 1, 1],
+                [[1, 2, 1], 2, 1, 1, 1],
+                [[2, 3, 2], 2, 2, 2, 2],
+                [[1, 2, 1], 1, 1, 1, 1],
+                [[2, 2, 2], 2, 2, 2, 2]]
+
+        # solutions from DS section 4
+        test = [1,
+                4,
+                2,
+                2,
+                2,
+                2,
+                1,
+                3,
+                2,
+                2,
+                4,
+                3,
+                1,
+                2,
+                1,
+                1,
+                1,
+                1,
+                2,
+                2,
+                2,
+                2,
+                2,
+                2,
+                1,
+                1,
+                2,
+                1,
+                1,
+                1,
+                1,
+                3,
+                1,
+                2,
+                2,
+                4,
+                2,
+                3,
+                3,
+                1,
+                1,
+                1,
+                2,
+                1,
+                2]
+
+        # cl_transform to list format
+        votes, workers, instances = [], [], []
+        for it_patient, patient in enumerate(data):
+            for it_doctor, doctor in enumerate(patient):
+                if isinstance(doctor, list):
+                    for diagnosis in doctor:
+                        votes.append(diagnosis-1)
+                        workers.append(it_doctor)
+                        instances.append(it_patient)
+                else:
+                    votes.append(doctor-1)
+                    workers.append(it_doctor)
+                    instances.append(it_patient)
+
+        return np.array(votes), np.array(workers), np.array(instances), np.array(test) - 1
+
+
+if __name__ == '__main__':
+    ds = DS()

+ 77 - 0
labels/crowd_labeling/MV.py

@@ -0,0 +1,77 @@
+import numpy as np
+
+
+class MV:
+
+    def __init__(self, votes=None, workers=None, instances=None, transform=None, classes=None):
+
+        # data info
+        self.V = len(votes)
+        self.U = len(np.unique(workers))
+        self.I = len(np.unique(instances))
+        if classes is not None:
+            self.C = len(classes)
+        else:
+            self.C = len(np.unique(votes))
+        self.transform = transform
+        self.eps = np.finfo(np.float64).eps
+
+        # info to save
+        self.labels = np.zeros((self.I, self.C))
+
+        # estimate label means and covariances using ds
+        self.mv(votes, workers, instances)
+
+        # apply transform
+        if transform == 'clr':
+
+            def clr(self):
+                continuous = np.log(self.labels + self.eps)
+                continuous -= continuous.mean(1, keepdims=True)
+                return continuous
+
+            self.labels = clr(self)
+
+        elif transform == 'alr':
+
+            def alr(self):
+                continuous = np.log(self.labels[:, :-1] / (self.labels[:, -1] + self.eps))
+                return continuous
+
+            self.labels = alr(self)
+
+        elif transform == 'ilr':
+
+            # make projection matrix
+            self.projectionMatrix = np.zeros((self.C, self.C - 1), dtype=np.float32)
+            for it in range(self.C - 1):
+                i = it + 1
+                self.projectionMatrix[:i, it] = 1. / i
+                self.projectionMatrix[i, it] = -1
+                self.projectionMatrix[i + 1:, it] = 0
+                self.projectionMatrix[:, it] *= np.sqrt(i / (i + 1.))
+
+            def ilr(self):
+                continuous = np.log(self.labels + self.eps)
+                continuous -= continuous.mean(1, keepdims=True)
+                continuous = np.dot(continuous, self.projectionMatrix)
+                return continuous
+
+            self.labels = ilr(self)
+
+    # DS optimization using EM
+    def mv(self, votes, workers, instances):
+
+        # vote weights
+        temp = np.vstack((workers, instances)).T
+        temp = np.ascontiguousarray(temp).view(np.dtype((np.void, temp.dtype.itemsize * temp.shape[1])))
+        _, unique_counts = np.unique(temp, return_counts=True)
+        weights = 1. / unique_counts[instances]
+
+        # initial estimates
+        for i in range(self.I):
+            ind = instances == i
+            for c in range(self.C):
+                self.labels[i, c] = ((votes[ind] == c) * weights[ind]).sum()
+        self.labels /= self.labels.sum(1, keepdims=True) + self.eps
+

+ 13 - 0
labels/crowd_labeling/README.md

@@ -0,0 +1,13 @@
+# Crowd Labeling
+Code to perform crowd labeling.
+
+#### Supported languages:
+* Python
+* MATLAB (planned)
+
+#### Supported algorithms:
+* Majority Vote
+* Dawid & Skene
+* Crowd Labeling Latent Dirichlet Allocation
+
+Pull requests are welcome.

+ 4 - 0
labels/crowd_labeling/__init__.py

@@ -0,0 +1,4 @@
+from .MV import MV
+from .DS import DS
+from .CLLDA import CLLDA, combine_cllda, concurrent_cllda
+from . import logratio_transformations

+ 205 - 0
labels/crowd_labeling/logratio_transformations.py

@@ -0,0 +1,205 @@
+import numpy as np
+
+
+# numpy functions
+
+# additive log-ratio cl_transform
+def additive_log_ratio_transform(compositional):
+    """Applies the additive log-ratio transform to compositional data."""
+    compositional = compositional[:] + np.finfo(compositional.dtype).eps
+    continuous = np.log(compositional[..., :-1] / compositional[..., -1, np.newaxis])
+    return continuous
+
+
+# inverse additive log-ratio cl_transform
+def inverse_additive_log_ratio_transform(continuous):
+    """Inverts the additive log-ratio transform, producing compositional data."""
+    n = continuous.shape[0]
+    compositional = np.hstack((np.exp(continuous), np.ones((n, 1))))
+    compositional /= compositional.sum(axis=-1, keepdims=1)
+    return compositional
+
+
+# centered log-ratio cl_transform
+def centered_log_ratio_transform(compositional):
+    """Applies the centered log-ratio transform to compositional data."""
+    continuous = np.log(compositional + np.finfo(compositional.dtype).eps)
+    continuous -= continuous.mean(-1, keepdims=True)
+    return continuous
+
+
+# inverse centered log-ratio cl_transform
+def inverse_centered_log_ratio_transform(continuous):
+    """Inverts the centered log-ratio transform, producing compositional data."""
+    compositional = np.exp(continuous)
+    compositional /= compositional.sum(axis=-1, keepdims=1)
+    return compositional
+
+
+# isometric log-ratio cl_transform
+def isometric_log_ratio_transform(compositional, projection_matrix):
+    """Applies the isometric log-ratio transform to compositional data."""
+    continuous = centered_log_ratio_transform(compositional)
+    continuous = np.dot(continuous, projection_matrix)
+    return continuous
+
+
+# inverse isometric log-ratio cl_transform
+def inverse_isometric_log_ratio_transform(continuous, projection_matrix):
+    """Inverts the isometric log-ratio transform, producing compositional data."""
+    continuous = np.dot(continuous, projection_matrix.T)
+    compositional = inverse_centered_log_ratio_transform(continuous)
+    return compositional
+
+
+# isometric log-ratio cl_transform
+def easy_isometric_log_ratio_transform(compositional):
+    """Applies the isometric log-ratio transform to compositional data."""
+    continuous = centered_log_ratio_transform(compositional)
+    projection_matrix = make_projection_matrix(continuous.shape[1])
+    continuous = np.dot(continuous, projection_matrix)
+    return continuous
+
+
+# inverse isometric log-ratio cl_transform
+def easy_inverse_isometric_log_ratio_transform(continuous):
+    """Inverts the isometric log-ratio transform, producing compositional data."""
+    projection_matrix = make_projection_matrix(continuous.shape[1] + 1)
+    continuous = np.dot(continuous, projection_matrix.T)
+    compositional = inverse_centered_log_ratio_transform(continuous)
+    return compositional
+
+
+# projection matrix for isometric log-ratio cl_transform
+def make_projection_matrix(dimension):
+    """Creates the projection matrix for the  the isometric log-ratio transform."""
+    projection_matrix = np.zeros((dimension, dimension - 1), dtype=np.float32)
+    for it in range(dimension - 1):
+        i = it + 1
+        projection_matrix[:i, it] = 1. / i
+        projection_matrix[i, it] = -1
+        projection_matrix[i + 1:, it] = 0
+        projection_matrix[:, it] *= np.sqrt(i / (i + 1.))
+    return projection_matrix
+
+
+# theano functions
+eps = np.finfo(np.float32).eps
+
+
+# additive log-ratio cl_transform
+def theano_additive_log_ratio_transform(compositional):
+    """Applies the additive log-ratio transform to compositional data."""
+    from theano import tensor as T
+    compositional = compositional[:] + eps
+    continuous = T.log(compositional[..., :-1] /
+                       compositional[..., -1].reshape(compositional.shape[:-1] + (1,)))
+    return continuous
+
+
+# inverse additive log-ratio cl_transform
+def theano_inverse_additive_log_ratio_transform(continuous):
+    """Inverts the additive log-ratio transform, producing compositional data."""
+    from theano import tensor as T
+    compositional = T.stack((T.exp(continuous), T.ones((continuous.shape[0], 1))), axis=continuous.ndim - 1)
+    compositional /= compositional.sum(axis=-1, keepdims=1)
+    return compositional
+
+
+# centered log-ratio cl_transform
+def theano_centered_log_ratio_transform(compositional):
+    """Applies the centered log-ratio transform to compositional data."""
+    from theano import tensor as T
+    compositional = compositional[:] + eps
+    continuous = T.log(compositional)
+    continuous -= continuous.mean(-1, keepdims=True)
+    return continuous
+
+
+# inverse centered log-ratio cl_transform
+def theano_inverse_centered_log_ratio_transform(continuous):
+    """Inverts the centered log-ratio transform, producing compositional data."""
+    from theano import tensor as T
+    compositional = T.exp(continuous)
+    compositional /= compositional.sum(axis=-1, keepdims=1)
+    return compositional
+
+
+# isometric log-ratio cl_transform
+def theano_isometric_log_ratio_transform(compositional, projection_matrix):
+    """Applies the isometric log-ratio transform to compositional data."""
+    from theano import tensor as T
+    continuous = theano_centered_log_ratio_transform(compositional)
+    continuous = T.dot(continuous, projection_matrix)
+    return continuous
+
+
+# inverse isometric log-ratio cl_transform
+def theano_inverse_isometric_log_ratio_transform(continuous, projection_matrix):
+    """Inverts the isometric log-ratio transform, producing compositional data."""
+    from theano import tensor as T
+    continuous = T.dot(continuous, projection_matrix.T)
+    compositional = theano_inverse_centered_log_ratio_transform(continuous)
+    return compositional
+
+
+# tensorflow functions
+
+# additive log-ratio cl_transform
+def tf_additive_log_ratio_transform(compositional, name='alrt'):
+    """Applies the additive log-ratio transform to compositional data."""
+    import tensorflow as tf
+    compositional = compositional + eps
+    continuous = tf.log(compositional[..., :-1] /
+                       compositional[..., -1].reshape(compositional.shape[:-1] + (1,)), name=name)
+    return continuous
+
+
+# inverse additive log-ratio cl_transform
+def tf_inverse_additive_log_ratio_transform(continuous, name='ialrt'):
+    """Inverts the additive log-ratio transform, producing compositional data."""
+    import tensorflow as tf
+    compositional = tf.stack((tf.exp(continuous), tf.ones((continuous.shape[0], 1))), axis=tf.get_shape(continuous).ndim - 1)
+    compositional /= tf.reduce_sum(compositional, axis=-1, keep_dims=True, name=name)
+    return compositional
+
+
+# centered log-ratio cl_transform
+def tf_centered_log_ratio_transform(compositional, name='clrt'):
+    """Applies the centered log-ratio transform to compositional data."""
+    import tensorflow as tf
+    compositional = compositional[:] + eps
+    continuous = tf.log(compositional)
+    continuous -= tf.reduce_mean(continuous, axis=-1, keep_dims=True)
+    if name:
+        continuous = tf.identity(continuous, name=name)
+    return continuous
+
+
+# inverse centered log-ratio cl_transform
+def tf_inverse_centered_log_ratio_transform(continuous, name='iclrt'):
+    """Inverts the centered log-ratio transform, producing compositional data."""
+    import tensorflow as tf
+    compositional = tf.exp(continuous)
+    compositional /= tf.reduce_sum(compositional, axis=-1, keep_dims=True)
+    if name:
+        compositional = tf.identity(compositional, name=name)
+    return compositional
+
+
+# isometric log-ratio cl_transform
+def tf_isometric_log_ratio_transform(compositional, projection_matrix, name='ilrt'):
+    """Applies the isometric log-ratio transform to compositional data."""
+    import tensorflow as tf
+    continuous = tf_centered_log_ratio_transform(compositional, name=None)
+    continuous = tf.matmul(continuous, projection_matrix, name=name)
+    return continuous
+
+
+# inverse isometric log-ratio cl_transform
+def tf_inverse_isometric_log_ratio_transform(continuous, projection_matrix, name='iilrt'):
+    """Inverts the isometric log-ratio transform, producing compositional data."""
+    import tensorflow as tf
+    continuous = tf.matmul(continuous, projection_matrix, transpose_b=True)
+    compositional = tf_inverse_centered_log_ratio_transform(continuous, name=None)
+    return tf.identity(compositional, name=name)