Browse Source

Upload files to 'labels'

Luca Pion-Tonachini 5 years ago
parent
commit
af7f2b4ae8
3 changed files with 418 additions and 0 deletions
  1. 212 0
      labels/CLLDA_for_ICLabel.py
  2. 104 0
      labels/CLLDA_for_ICLabel_test.py
  3. 102 0
      labels/load_website_data.py

+ 212 - 0
labels/CLLDA_for_ICLabel.py

@@ -0,0 +1,212 @@
+from load_website_data import load_icl
+import numpy as np
+try:
+    from scipy.io import savemat
+except ImportError:
+    pass
+from crowd_labeling import CLLDA, concurrent_cllda, combine_cllda
+from crowd_labeling.MV import MV
+import cPickle as pkl
+import json
+from os.path import join, isfile, isdir
+import argparse
+from copy import deepcopy
+import sys
+
# parse input arguments
print('parsing arguments')
parser = argparse.ArgumentParser(description='Run or update CL estimates.')
parser.add_argument('database', type=str, help='Absolute reference to the sqlite database file.')
parser.add_argument('save', type=str, help='Directory in which to save results.')
parser.add_argument('-classifications', type=str, help='Directory in which to save results for website viewing.',
                    default=None)
args = parser.parse_args()
database = args.database
path = args.save
classifications_path = args.classifications
assert isfile(database), 'database path does not exist'
assert isdir(path), 'save path does not exist'
# bug fix: -classifications is optional (defaults to None); the original
# unconditional isdir(classifications_path) raised TypeError whenever the
# flag was omitted. Only validate the directory when it was supplied.
assert classifications_path is None or isdir(classifications_path), 'classifications path does not exist'
+
# load sqlite data and unpack the tabulated vote vectors
print('loading database')
icl_votes = load_icl(database)
votes = icl_votes['votes']  # response index of each individual vote
workers = icl_votes['workers']  # worker index aligned with `votes`
instances = icl_votes['instances']  # instance (IC) index aligned with `votes`
instance_set_numbers = icl_votes['instance_set_numbers']  # EEG set number per instance
instance_ic_numbers = icl_votes['instance_ic_numbers']  # IC number within its set
# display names for the 8 response options ('?' is the unsure response)
# NOTE(review): this overrides the column-derived icl_votes['vote_ids'] --
# confirm the hard-coded order matches the database columns
vote_ids = np.array(['Brain', 'Muscle', 'Eye', 'Heart', 'Line Noise', 'Chan Noise', 'Other', '?'])
instance_ids = icl_votes['instance_ids']
worker_ids = icl_votes['worker_ids']
T = icl_votes['n_classes']  # number of label classes (7)
C = icl_votes['n_responses']  # number of response options (classes + '?')
A = icl_votes['n_workers']  # number of workers
is_expert = (icl_votes['is_expert']).astype(bool)  # type = np.ndarray
+
+
# append identifier to string
def add_identifier(string, identifier):
    """Join the non-None parts of (string, identifier) with an underscore."""
    parts = [part for part in (string, identifier) if part is not None]
    return '_'.join(parts)
+
+
# run cllda
def run_cllda(save_path, vts, wks, its, vt_ids=None, it_ids=None, wk_ids=None, it_priors=None, wk_priors=None,
              it_set_numbers=None, it_ic_numbers=None, identifier=None):
    """Create or update CLLDA models and return the combined estimates.

    If a pickled model file for *identifier* already exists in *save_path*,
    the saved chains are loaded and updated with the provided votes; otherwise
    four fresh chains are started with all label transforms. The chains are
    re-pickled and then merged into a single combined estimate.

    :param save_path: directory holding the pickled model file
    :param vts: vote-response vector
    :param wks: worker index vector aligned with vts
    :param its: instance index vector aligned with vts
    :param vt_ids: optional vote ids passed through to CLLDA
    :param it_ids: optional instance ids passed through to CLLDA
    :param wk_ids: optional worker ids passed through to CLLDA
    :param it_priors: instance prior (used on fresh runs only)
    :param wk_priors: per-worker prior matrices
    :param it_set_numbers: per-instance set numbers (returned as ints)
    :param it_ic_numbers: per-instance IC numbers (returned as ints)
    :param identifier: suffix distinguishing this model family on disk
    :return: dict of combined labels, covariances, worker matrices, and ids
    """
    model_file = join(save_path, add_identifier('icl_cllda_models', identifier) + '.pkl')

    if isfile(model_file):
        # resume: load the previously saved chains and fold in the votes
        with open(model_file, 'rb') as f:
            cls = pkl.load(f)
        # NOTE(review): it_priors is only honored on fresh runs, never on
        # updates -- confirm that is intended
        cls = concurrent_cllda(cls, vts, wks, its, nprocs=4, vote_ids=vt_ids, instance_ids=it_ids,
                               worker_ids=wk_ids, worker_prior=wk_priors, num_epochs=800, burn_in=0)
    else:
        # fresh start: four chains, every supported transform
        cls = concurrent_cllda(4, vts, wks, its, nprocs=4, vote_ids=vt_ids, instance_ids=it_ids,
                               worker_ids=wk_ids, worker_prior=wk_priors, instance_prior=it_priors,
                               transform=('none', 'ilr', 'clr', 'alr'), num_epochs=1000, burn_in=200)

    # persist the individual chains so later runs can resume from them
    with open(model_file, 'wb') as f:
        pkl.dump(cls, f)

    # merge the chains into one combined model
    cl = combine_cllda(cls)

    # package the combined estimates
    return {
        'instance_ids': cl.instance_ids,
        'worker_ids': cl.worker_ids,
        'vote_ids': cl.vote_ids,
        'instance_set_numbers': it_set_numbers.astype(int),
        'instance_ic_numbers': it_ic_numbers.astype(int),
        'transform': cl.transform,
        'labels': cl.labels,
        'labels_cov': cl.labels_cov,
        'worker_mats': cl.worker_mats,
    }
+
+
# save results in 3 different formats
def save_results(save_path, data, identifier=None):
    """Save aggregated results for php (json), python (pickle), and matlab (.mat).

    Also writes the website-viewable classification files when the global
    ``classifications_path`` was supplied on the command line.

    :param save_path: directory for the json/pickle/mat outputs
    :param data: dict of result arrays; mutated in place by the matlab step
    :param identifier: optional suffix appended to every output file name
    """
    # save combined model for php: JSON cannot serialize ndarrays, so convert
    # them (including ndarrays nested one level inside lists) to plain lists
    print('saving for php')
    json_data = deepcopy(data)
    for key, val in json_data.items():
        if isinstance(val, np.ndarray):
            json_data[key] = val.tolist()
        elif isinstance(val, list):
            for it, item in enumerate(val):
                if isinstance(item, np.ndarray):
                    val[it] = item.tolist()
            json_data[key] = val
    # consistency fix: open JSON output in text mode ('w'), matching the
    # website files below, instead of the binary-mode 'wb' py2-ism
    with open(join(save_path, add_identifier('ICLabels', identifier) + '.json'), 'w') as f:
        json.dump(json_data, f)

    # save combined model for python
    print('saving for python')
    with open(join(save_path, add_identifier('ICLabels', identifier) + '.pkl'), 'wb') as f:
        pkl.dump(data, f)

    # save combined model for matlab, if scipy.io.savemat was importable.
    # bug fix: the original tested `'savemat' in sys.modules`, which is always
    # False (sys.modules holds module names such as 'scipy.io', not imported
    # function names), so the .mat file was silently never written.
    if 'savemat' in globals():
        print('saving for matlab')
        for key, val in data.items():
            if not isinstance(val, np.ndarray):
                try:
                    val = np.array(val)
                except ValueError:
                    # ragged sequence: store as an object array for savemat
                    data[key] = np.empty(len(val), dtype=object)
                    for it, item in enumerate(val):
                        data[key][it] = item
                    continue
            if not np.issubdtype(val.dtype, np.number):
                # savemat needs non-numeric arrays as object arrays
                data[key] = val.astype(object)
        savemat(join(save_path, add_identifier('ICLabels', identifier) + '.mat'), data)

    # optionally save classifications for website viewing
    # bug fix: guard against classifications_path being None (the command-line
    # flag is optional), which previously made isdir() raise TypeError
    if classifications_path is not None and isdir(classifications_path) and \
            all(x in data for x in ('labels', 'vote_ids', 'instance_set_numbers', 'instance_ic_numbers')):
        path_str = join(classifications_path, add_identifier('website', identifier) + '_icl_')
        with open(path_str + 'index.json', 'w') as f:
            # list() keeps this JSON-serializable under python 3 as well
            json.dump(list(zip(json_data['instance_set_numbers'], json_data['instance_ic_numbers'])), f)
        with open(path_str + 'classifications.json', 'w') as f:
            try:
                # prefer the untransformed ('none') label estimates
                json.dump(json_data['labels'][np.where(np.array(data['transform']) == 'none')[0][0]], f)
            except KeyError:
                # no 'transform' key (e.g. MV results): labels are flat
                json.dump(json_data['labels'], f)
        with open(path_str + 'classes.json', 'w') as f:
            # drop the trailing '?' pseudo-class from the class list
            json.dump(json_data['vote_ids'][:-1], f)
+
+
# CLLDA settings
n_pseudovotes_e = 100  # pseudo-vote strength of the expert prior
n_pseudovotes_u = 1  # pseudo-vote strength of the regular-user prior
# diagonal (T x C) worker priors; the appended zero column is the '?' response
expert_prior = n_pseudovotes_e * (np.hstack((np.eye(T), np.zeros((T, 1))))) + 0.01
user_prior = n_pseudovotes_u * (np.hstack((np.eye(T), np.zeros((T, 1))))) + 0.01
# stack per-worker priors: experts get the strong prior, everyone else the weak one
# NOTE(review): np.bool is removed in numpy >= 1.24, and is_expert is already bool
all_priors = np.zeros((A, T, C))
all_priors[is_expert.astype(np.bool), :, :] = np.tile(expert_prior[np.newaxis], [is_expert.sum(), 1, 1])
all_priors[np.logical_not(is_expert), :, :] = np.tile(user_prior[np.newaxis], [np.logical_not(is_expert).sum(), 1, 1])
# NOTE(review): bins=range(C) yields C-1 histogram bins, so the last two
# response categories share a bin -- confirm range(C + 1) was not intended
instance_prior = np.histogram(votes, range(C))[0] / 100. / np.histogram(votes, range(C))[0].sum()

# run and save CLLDA with experts
tag = 'expert'
print('Running CLLDA_' + tag + '...')
out = run_cllda(path, votes, workers, instances, vote_ids, instance_ids, worker_ids, instance_prior,
                all_priors, instance_set_numbers, instance_ic_numbers, tag)
print('Saved individual CLLDA_' + tag + ' models')
print('Saving combined results...')
save_results(path, out, tag)
print('Saved combined results')
+
+
# run CLLDA without experts (everyone treated with the weak user prior)
tag = 'noexpert'
print('Running CLLDA_' + tag + '...')
# NOTE(review): the expert run passes all_priors with shape (A, T, C) but this
# passes user_prior with shape (T, C) -- confirm concurrent_cllda broadcasts a
# single worker prior, otherwise this should be a per-worker tile of user_prior
out = run_cllda(path, votes, workers, instances, vote_ids, instance_ids, worker_ids, instance_prior,
                user_prior, instance_set_numbers, instance_ic_numbers, tag)
print('Saved individual CLLDA_' + tag + ' models')
print('Saving combined results...')
save_results(path, out, tag)
print('Saved combined results')
+
+
# run and save with only luca

# remove non-luca votes
# NOTE(review): assumes worker index 0 is Luca -- verify the ordering of worker_ids
worker_ids_lu = worker_ids[0]  # single worker id (scalar)
luca_ind = np.in1d(workers, (0,))
votes_lu = votes[luca_ind]
workers_lu = workers[luca_ind]
instances_lu = instances[luca_ind]

# remove instances with votes that are unsure (response 7 is '?')
keep_index = np.logical_not(np.in1d(instances_lu, np.unique(instances_lu[votes_lu == 7])))
votes_lu = votes_lu[keep_index]
workers_lu = workers_lu[keep_index]
instances_lu = instances_lu[keep_index]
instance_ids_lu = instance_ids[np.unique(instances_lu)]

# reset instance numbering to a dense 0..N-1 range
instance_set_numbers_lu = np.array(instance_set_numbers)[np.unique(instances_lu)]
instance_ic_numbers_lu = np.array(instance_ic_numbers)[np.unique(instances_lu)]
instances_lu = np.array([{x: y for x, y in zip(np.unique(instances_lu),
                                               np.arange(np.unique(instances_lu).size))}[z]
                         for z in instances_lu])

# run MV (majority vote) on the single-worker data
cl = MV(votes_lu, workers_lu, instances_lu)

# save results (no 'transform' key: save_results falls back to flat labels)
save_results(path, {
    'instance_ids': instance_ids_lu,
    'worker_ids': worker_ids_lu,
    'vote_ids': vote_ids,
    'instance_set_numbers': instance_set_numbers_lu,
    'instance_ic_numbers': instance_ic_numbers_lu,
    'labels': cl.labels,
}, 'onlyluca')
+

+ 104 - 0
labels/CLLDA_for_ICLabel_test.py

@@ -0,0 +1,104 @@
from load_website_data import load_icl_test
import numpy as np
from crowd_labeling.CLLDA import concurrent_cllda, combine_cllda
import cPickle as pkl
from scipy.io import savemat

# load sqlite data
# NOTE(review): load_icl_test is not defined in labels/load_website_data.py in
# this commit -- confirm it exists elsewhere before running this script
icl_votes = load_icl_test('database.sqlite')
votes_vec = icl_votes['votes']
votes_vec_workers = icl_votes['workers']
votes_vec_instances = icl_votes['instances']
instance_study_numbers = icl_votes['instance_study_numbers']
instance_set_numbers = icl_votes['instance_set_numbers']
instance_ic_numbers = icl_votes['instance_ic_numbers']
T = icl_votes['n_classes']  # number of label classes
C = icl_votes['n_responses']  # number of response options (classes + '?')
A = icl_votes['n_workers']  # number of workers

# CLLDA settings: diagonal worker prior (5 pseudo-votes) and a scaled
# empirical instance prior
# NOTE(review): bins=range(C) yields C-1 histogram bins, so the last two
# response categories share a bin -- confirm range(C + 1) was not intended
all_priors = np.tile(np.maximum(np.hstack((5*np.eye(T), np.zeros((T, 1)))), 0.01), [A, 1, 1])
instance_prior = np.histogram(votes_vec, range(C))[0] / 100. / np.histogram(votes_vec, range(C))[0].sum()

# CLLDA with all transforms (4 chains in 4 processes)
cls = concurrent_cllda(4, votes_vec, votes_vec_workers, votes_vec_instances, nprocs=4,
                       worker_prior=all_priors, instance_prior=instance_prior,
                       transform=('none', 'ilr', 'clr', 'alr'), num_epochs=1000, burn_in=200)

# combine models
cl = combine_cllda(cls)



# CLLDA with all transforms, weak prior (1 pseudo-vote on the diagonal)
# NOTE(review): cl_weak is computed but never saved below -- confirm intended
all_priors_weak = np.tile(np.maximum(np.hstack((np.eye(T), np.zeros((T, 1)))), 0.01), [A, 1, 1])
cls_weak = concurrent_cllda(4, votes_vec, votes_vec_workers, votes_vec_instances, nprocs=4,
                       worker_prior=all_priors_weak, instance_prior=instance_prior,
                       transform=('none', 'ilr', 'clr', 'alr'), num_epochs=1000, burn_in=200)
cl_weak = combine_cllda(cls_weak)

# Baselines: MV (majority vote) and DS (Dawid-Skene) versus CLLDA, handling
# the '?' response (index 7) two different ways.
from crowd_labeling import MV
from crowd_labeling import DS
# strategy 1: ignore the '?' votes, keep everything else
ind = votes_vec != 7
temp_votes_vec = votes_vec[ind]
temp_votes_vec_workers = votes_vec_workers[ind]
temp_votes_vec_instances = votes_vec_instances[ind]
cls_ignore = concurrent_cllda(4, temp_votes_vec, temp_votes_vec_workers, temp_votes_vec_instances, nprocs=4,
                              worker_prior=all_priors, instance_prior=instance_prior,
                              transform=('none', 'ilr', 'clr', 'alr'), num_epochs=1000, burn_in=200)
cl_ignore = combine_cllda(cls_ignore)
# re-index workers/instances densely for MV and DS
_, temp_votes_vec_workers = np.unique(temp_votes_vec_workers, return_inverse=True)
_, temp_votes_vec_instances = np.unique(temp_votes_vec_instances, return_inverse=True)
mv_ignore = MV(temp_votes_vec, temp_votes_vec_workers, temp_votes_vec_instances)
ds_ignore = DS(temp_votes_vec, temp_votes_vec_workers, temp_votes_vec_instances)
# strategy 2: drop every (worker, instance) label form that contains a '?'
ind = votes_vec == 7
to_remove = np.stack((votes_vec_workers[ind], votes_vec_instances[ind])).T
ind = np.ones_like(votes_vec, dtype=bool)
for it, vote in enumerate(np.stack((votes_vec_workers, votes_vec_instances)).T):
    if (vote == to_remove).all(1).any():
        ind[it] = False
temp_votes_vec = votes_vec[ind]
temp_votes_vec_workers = votes_vec_workers[ind]
temp_votes_vec_instances = votes_vec_instances[ind]
_, temp_votes_vec_workers = np.unique(temp_votes_vec_workers, return_inverse=True)
_, temp_votes_vec_instances = np.unique(temp_votes_vec_instances, return_inverse=True)
mv_remove = MV(temp_votes_vec, temp_votes_vec_workers, temp_votes_vec_instances)
ds_remove = DS(temp_votes_vec, temp_votes_vec_workers, temp_votes_vec_instances)
cls_remove = concurrent_cllda(4, temp_votes_vec, temp_votes_vec_workers, temp_votes_vec_instances, nprocs=4,
                              worker_prior=all_priors, instance_prior=instance_prior,
                              transform=('none', 'ilr', 'clr', 'alr'), num_epochs=1000, burn_in=200)
cl_remove = combine_cllda(cls_remove)



# results to save -- one entry per transform (none, ilr, clr, alr)
# NOTE(review): only `cl` (strong prior, full data) is saved; the weak /
# ignore / remove variants computed above are discarded -- confirm intended
save = dict()
save['instance_labels'] = cl.labels[0]
save['instance_labels_ilr'] = cl.labels[1]
save['instance_labels_clr'] = cl.labels[2]
save['instance_labels_alr'] = cl.labels[3]
save['instance_label_cov'] = cl.labels_cov[0]
save['instance_label_cov_ilr'] = cl.labels_cov[1]
save['instance_label_cov_clr'] = cl.labels_cov[2]
save['instance_label_cov_alr'] = cl.labels_cov[3]
save['instance_id'] = cl.instance_ids
save['instance_number'] = votes_vec_instances
save['instance_study_numbers'] = instance_study_numbers
save['instance_set_numbers'] = instance_set_numbers
save['instance_ic_numbers'] = instance_ic_numbers
save['raw_instances'] = votes_vec_instances
save['raw_workers'] = votes_vec_workers
save['raw_votes'] = votes_vec
save['worker_mats'] = cl.worker_mats
save['worker_prior'] = all_priors[0]
save['instance_prior'] = instance_prior
save['num_epoch'] = 1000
save['burn_in'] = 200

# save for python (pickle) and matlab (.mat)
with open('ICLabels_test.pkl', 'wb') as f:
    pkl.dump(save, f)
savemat('ICLabels_test.mat', save, oned_as='column')

+ 102 - 0
labels/load_website_data.py

@@ -0,0 +1,102 @@
+import sqlite3 as sql
+import numpy as np
+import pdb
+
+
def load_icl(db_path):
    """Load and tabulate ICLabel crowd-labeling votes from the website sqlite DB.

    Reads the ``users``, ``labels``, and ``images`` tables, drops workers with
    fewer than 10 submitted labels, drops instances answered only with "?",
    and unrolls the per-label 0/1 response matrix into parallel
    (vote, worker, instance) vectors as expected by the CL estimators.

    :param db_path: path to the sqlite database file
    :return: dict of vote vectors, id arrays, and size constants
        (see the return statement for the exact keys)
    :raises NotImplementedError: if instances answered only with "?" had to
        be removed (a case the pipeline does not fully handle yet)
    """

    # load raw rows from the three tables
    connection = sql.connect(db_path)
    cursor = connection.cursor()
    cursor.execute('SELECT * FROM users')
    db_combined = cursor.fetchall()  # users and experts combined
    cursor.execute('SELECT * FROM labels')
    db_labels = cursor.fetchall()  # one row per submitted label form
    db_labels_column_names = [x[0] for x in cursor.description]
    cursor.execute('SELECT * FROM images')
    db_images = cursor.fetchall()  # one row per IC image
    connection.close()

    # remove users with not enough labels
    # (label row layout: x[1] user id, x[2] image id, x[3:11] responses)
    min_labels = 10
    user_labs = [x[1] for x in db_labels]
    user_labs_count = np.array([user_labs.count(x) for x in [x[0] for x in db_combined]])
    db_combined = [db_combined[x] for x in np.where(user_labs_count >= min_labels)[0]]

    # drop labels from the users removed above
    # (set lookup replaces the original O(n*m) nested list scan)
    kept_user_ids = {y[0] for y in db_combined}
    db_labels = [x for x in db_labels if x[1] in kept_user_ids]

    # remove instances which only have "?" (column 10) as an answer:
    # for every image with at least one "?", keep it only if some non-"?"
    # class (columns 3:10) was ever selected
    remove = list()
    for it in np.unique([x[2] for x in db_labels if x[10]]):
        if not np.sum([x[3:10] for x in db_labels if x[2] == it]):
            remove.append(it)
    if remove:
        db_labels = [x for x in db_labels if x[2] not in remove]
        # bug fix: the original built this exception but never raised it (a
        # no-op expression statement), silently continuing past an unhandled case
        raise NotImplementedError('there are some dead answers that need input')

    # TODO: fix the above. doesn't catch everything

    # keep only images that still have labels
    # NOTE(review): assumes image_id equals the 1-based row position in the
    # images table -- confirm against the website schema
    db_images = [db_images[y-1] for y in np.unique([x[2] for x in db_labels])]

    # tabulate data
    I = len(set([x[2] for x in db_labels]))  # number of labeled images
    A = len(db_combined)  # number of workers (users and experts combined)

    # sqlite user id -> dense worker index
    combined_ind = [x[0] for x in db_combined]
    combined_dict = {x: y for x, y in zip(combined_ind, range(A))}

    # sqlite image id -> dense instance index
    im_ind = sorted(set(x[2] for x in db_labels))
    im_dict = {x: y for x, y in zip(im_ind, range(I))}

    # response matrix (one row per label form, one column per response option)
    votes_mat = np.array([x[3:11] for x in db_labels])
    is_expert = np.array([x[4] for x in db_combined])

    # instance / worker index of each label row
    iV = np.array([im_dict[x[2]] for x in db_labels])
    uV = np.array([combined_dict[x[1]] for x in db_labels])

    # tabulate more data
    V = len(votes_mat)  # number of label forms
    T = 7  # number of topics (estimated truths)
    C = T + 1  # number of response categories (classes + "?")

    # unroll the 0/1 response matrix: one entry per checked response
    nz = np.nonzero(votes_mat)
    votes_vec = nz[1]
    votes_vec_workers = uV[nz[0]]
    votes_vec_instances = iV[nz[0]]

    # per-instance dataset info
    instance_set_numbers = np.array([x[2] for x in db_images])
    instance_ic_numbers = np.array([x[3] for x in db_images])
    instance_ids = np.array([x[0] for x in db_images])

    return {'votes': votes_vec,
            'workers': votes_vec_workers,
            'instances': votes_vec_instances,
            'is_expert': is_expert,
            'instance_set_numbers': instance_set_numbers,
            'instance_ic_numbers': instance_ic_numbers,
            'instance_ids': instance_ids,
            'worker_ids': np.array([x[1] for x in db_combined]),
            'vote_ids': np.array(db_labels_column_names[3:11]),
            'n_votes': V,
            'n_classes': T,
            'n_responses': C,
            'n_instances': I,
            'n_workers': A}