cimport liblinear
cimport liblinear_enums
import zlib

# solver types, re-exported from the liblinear_enums declarations
L2_LR = liblinear_enums.L2_LR
L2LOSS_SVM_DUAL = liblinear_enums.L2LOSS_SVM_DUAL
L2LOSS_SVM = liblinear_enums.L2LOSS_SVM
L1LOSS_SVM_DUAL = liblinear_enums.L1LOSS_SVM_DUAL
MCSVM_CS = liblinear_enums.MCSVM_CS

version = "1.1.5"

# TODO: scipy sparse matrices !!!


def __fromstring(txt):
    """ rebuild a LinearSVM from the zlib-compressed text produced by
    LinearSVM.__reduce__ (needed for unpickling LinearSVM objects)
    """
    return LinearSVM.fromstring(zlib.decompress(txt))

#===============================================================================
#
#  important note:
#  we allow indices of sparse vectors >=0, but linearsvm library
#  wants >= 1, so we have to increase index by one when processing input
#  data
#
#===============================================================================


def vector2sparse(vec):
    """ convert list of values to its sparse representation

    Returns a list of (index, value) tuples, one per element of 'vec',
    with indices starting at 0.
    """
    return list(enumerate(vec))


def matrix2sparse(mat):
    """ convert matrix or list of lists to sparse representation

    Returns one list of (index, value) tuples per row of 'mat'.

    Raises ValueError if 'mat' exposes an 'ndim' attribute that is not 2.
    Objects without 'ndim' (plain lists of lists) are accepted as they are.
    """
    # only array-like objects carry 'ndim'; a plain list of lists does not,
    # and must not fail with AttributeError here
    ndim = getattr(mat, "ndim", None)
    if ndim is not None and ndim != 2:
        raise ValueError("mat argument is not a matrix")
    return [list(enumerate(row)) for row in mat]


cdef int find_max_column(data):
    """ find maximal column in sparse matrix

    'data' is a sequence of sparse vectors (sequences of (index, value)).
    The returned value is the largest index *after* the +1 shift described
    in the "important note" above.
    Returns -1 if a shifted index is <= 0 (invalid input) or if 'data'
    contains no entries at all.
    """
    cdef int num_data = len(data)
    cdef int i, j, idx
    cdef int maxidx = - 1
    for i from 0 <= i < num_data:
        vec = data[i]
        n = len(vec)
        for j from 0 <= j < n:
            idx = vec[j][0]
            idx += 1     # +1: see comment above !
            if idx <= 0: # invalid, not allowed to happen !
                return - 1
            if idx > maxidx:
                maxidx = idx
    return maxidx


cdef feature_node * build_vector(list vec, int num_feat):
    """ builds vector from python list.

    list contains tuples (index, value), each vector gets value 1 at
    its end for incorporating bias term

    The returned array has len(vec) + 2 nodes: the data nodes, the bias
    node at index num_feat + 1, and an end marker with index -1.
    It is allocated with malloc, the caller has to free() it.
    """
    cdef:
        int n = len(vec)
        int j, idx
        feature_node * nodes
    nodes = <feature_node *> malloc((n + 2) * sizeof(feature_node))
    for j from 0 <= j < n:
        node = vec[j]
        idx = node[0]
        nodes[j].index = idx + 1 # +1: see "important note" above
        nodes[j].value = node[1]
    nodes[n].index = num_feat + 1 # bias node
    nodes[n].value = 1 # bias "feature"
    nodes[n + 1].index = - 1 # end marker
    return nodes


#---------------------------------------------------------------- wrapper class
cdef class Parameter:
    """ wraps liblinears parameter struct

    The weight arrays are allocated with malloc in __init__ and have to
    be freed explicitly by calling release().
    """

    cdef parameter param

    def __init__(self, mach, eps, C, weights, num_classes):
        """ fill the parameter struct

        mach        -- solver type (one of the module level constants)
        eps         -- stored as param.eps
        C           -- stored as param.C
        weights     -- one weight per class, or None for weight 1.0 each
        num_classes -- number of classes, labels are 0 .. num_classes-1

        Raises ValueError if len(weights) != num_classes.
        """
        cdef int i
        if weights is not None:
            if len(weights) != num_classes:
                raise ValueError("data and weights do not fit")
        else:
            weights = [1.0] * num_classes
        self.param.solver_type = mach
        self.param.eps = eps
        self.param.C = C
        self.param.nr_weight = num_classes
        self.param.weight = <double *> malloc(num_classes * sizeof(double))
        self.param.weight_label = <int *> malloc(num_classes * sizeof(int))
        for i from 0 <= i < num_classes:
            self.param.weight[i] = weights[i]
            self.param.weight_label[i] = i

    def release(self):
        """ frees memory of the weight arrays """
        free(self.param.weight)
        free(self.param.weight_label)


#----------------------------------------------------------------- wrapper class
cdef class Problem:
    """ wraps liblinears problem struct

    Label and feature arrays are allocated with malloc in __init__ and
    have to be freed explicitly by calling release().
    """

    cdef problem prob
    cdef int num_feat
    cdef int num_classes
    cdef int num_all

    def __init__(self, data):
        """ build the problem struct from 'data'

        data[i] is the list of sparse vectors of class i, each vector is
        a list of (index, value) tuples; class i gets label i.
        """
        cdef int num_classes = len(data)
        cdef int * num_in_class
        cdef int num_all
        cdef int num_feat
        cdef int i, j, n
        cdef problem prob

        # num_in_class is only scratch memory for this method, the
        # try/finally makes sure it does not leak
        num_in_class = <int *> malloc(num_classes * sizeof(int))
        try:
            #------------------------------------------------- count class sizes
            num_all = 0
            for i from 0 <= i < num_classes:
                num_in_class[i] = len(data[i])
                num_all += num_in_class[i]
            prob.l = num_all

            #-------------------------------------------------------- set labels
            prob.y = <int *> malloc(num_all * sizeof(int))
            n = 0
            for i from 0 <= i < num_classes:
                for j from 0 <= j < num_in_class[i]:
                    prob.y[n] = i
                    n += 1

            #------- get max column index of sparse vectors = number of features
            num_feat = - 1
            for i from 0 <= i < num_classes:
                n = find_max_column(data[i])
                if n > num_feat:
                    num_feat = n

            #-------------------------------------- build sparse feature vectors
            prob.x = <feature_node **> malloc(num_all * sizeof(feature_node *))
            n = 0
            for i from 0 <= i < num_classes:
                for j from 0 <= j < num_in_class[i]:
                    prob.x[n] = build_vector(data[i][j], num_feat)
                    n += 1
        finally:
            free(num_in_class)

        prob.bias = 1
        prob.n = num_feat + 1   # +1 because bias term

        self.prob = prob
        self.num_feat = num_feat
        self.num_all = num_all
        self.num_classes = num_classes

    def release(self):
        """ frees memory """
        cdef int i
        free(self.prob.y)
        for i from 0 <= i < self.num_all:
            free(self.prob.x[i])
        free(self.prob.x)


#----------------------------------------------------------------- wrapper class
cdef class LinearSVM:
    """ wraps a liblinear model

    Instances are created with the classmethods train() or fromstring().
    A LinearSVM() built directly has no model; the methods guarded below
    raise ValueError instead of dereferencing NULL in that case.
    """

    cdef model * mod

    def __init__(self, *a, **b):
        """ create an empty (untrained) object, takes no arguments """
        if len(a) or len(b):
            raise ValueError("you should not use __init__ for training, use "
                             "train() instead")
        self.mod = NULL

    @classmethod
    def train(cls, data, mach=L2LOSS_SVM_DUAL, float C=1.0, float eps=0.01,
              weights=None, verbose=0):
        """ train a classifier and return it as a new LinearSVM

        data    -- sequence with one entry per class; entry i is the list
                   of sparse vectors ((index, value) tuples) of class i
        mach    -- solver type (one of the module level constants)
        C, eps  -- stored in the liblinear parameter struct
        weights -- one weight per class, None means 1.0 for every class
        verbose -- if true, progress is printed; the value is also passed
                   on to the training routine

        Raises RuntimeError if there are fewer than two classes, if 'data'
        can not be indexed as data[0][0][0], or if check_parameter rejects
        the configuration.
        Raises ValueError if 'weights' does not fit the number of classes.
        """
        cdef LinearSVM rv = LinearSVM()
        cdef int num_classes = len(data)
        cdef Problem prob
        cdef Parameter param
        cdef object py_message
        cdef char * message
        cdef model * mod = NULL

        if verbose:
            print "VERBOSE"
            print "got", num_classes, "classes to learn"

        if num_classes < 2:
            raise RuntimeError("need at least two classes")
        try:
            data[0][0][0]
        except Exception:
            raise RuntimeError("data has wrong dimension, must be"
                               " tuple of vector lists")

        if verbose:
            print "build Problem"
        prob = Problem(data)
        # from here on prob (and later param) own malloc'ed memory, so
        # they are released on every exit path, including errors
        try:
            if verbose:
                print "configure machine"
            param = Parameter(mach, eps, C, weights, num_classes)
            try:
                #------------------------------------------- check configuration
                message = <char *> check_parameter(&prob.prob, &param.param)
                if message:
                    py_message = message
                    raise RuntimeError("param invalid: " + py_message)

                #---------------------------------------------- train classifier
                if verbose:
                    print "start training"
                # 'train' here is the C level training routine, not this
                # classmethod
                mod = train(&prob.prob, &param.param, verbose)
                if verbose:
                    print "finished training"
            finally:
                #--------------------------------------------- free aux memory
                param.release()
        finally:
            prob.release()

        #----------------------------------------------------- store results
        rv.mod = mod
        return rv

    def build_confusion_matrix(self, data):
        """ builds confusion matrix of 'data' with learnt model

        data[i] holds the vectors that belong to class i. Row i of the
        result counts, per predicted class, the vectors of data[i].

        Raises ValueError if len(data) differs from the number of classes
        of the model, or if there is no model.
        """
        if self.mod == NULL:
            raise ValueError("model is not trained")
        cdef model * mod = self.mod
        if len(data) != mod.nr_class:
            raise ValueError("need %d classes to evaluate" % mod.nr_class)
        cdef list mat_row
        cdef list confusion_matrix = []
        for dataset in data:
            mat_row = [0] * mod.nr_class
            for vec in dataset:
                classified_as = self.predict(vec)
                mat_row[classified_as] += 1
            confusion_matrix.append(mat_row)
        return confusion_matrix

    def tostring(self):
        """ converts model to text representation """
        if self.mod == NULL:
            raise ValueError("model is not trained")
        cdef char * cc
        cc = to_string(self.mod)
        res = PyString_FromStringAndSize(cc, strlen(cc))
        free(cc)
        return res

    @classmethod
    def fromstring(cls, input):
        """ builds model from text representation

        Raises ValueError if the model can not be created from 'input'.
        """
        cdef LinearSVM rv = LinearSVM()
        rv.mod = from_string(input)
        if rv.mod == NULL:
            raise ValueError("error when creating model from stream")
        return rv

    def __reduce__(self):
        """ needed for pickling this extension class """
        # NOTE(review): newer Cython versions may apply private name
        # mangling to '__fromstring' inside a class body - verify when
        # upgrading the compiler
        return __fromstring, (zlib.compress(self.tostring()),)

    def predict(self, vec):
        """ classify vector

        'vec' is a list of (index, value) tuples, returns the class label.
        """
        if self.mod == NULL:
            raise ValueError("model is not trained")
        cdef feature_node * v = build_vector(vec, self.mod.nr_feature)
        cdef int val = predict(self.mod, v)
        free(v)
        return val

    def predict_probabilites(self, vec):
        """ calculate class probabilities of vector

        Returns a list with one value per class.
        Raises ValueError if the model was not trained with L2_LR.
        """
        if self.mod == NULL:
            raise ValueError("model is not trained")
        if self.mod.param.solver_type != L2_LR:
            raise ValueError("method only allowed for L2_LR method")
        cdef int nc = self.mod.nr_class
        cdef feature_node * v = build_vector(vec, self.mod.nr_feature)
        cdef double * probs = <double *> malloc(nc * sizeof(double))
        predict_probability(self.mod, v, probs)
        rv = [0] * nc
        cdef int i
        for i from 0 <= i < nc:
            rv[i] = probs[i]
        free(v)
        free(probs)
        return rv

    def predict_values(self, vec):
        """ decision values for each class

        This function returns nr_w decision values,
        nr_w is 1 if there are two classes except multi-class svm by Crammer
        and Singer, and is the number of classes otherwise.
        """
        if self.mod == NULL:
            raise ValueError("model is not trained")
        cdef feature_node * v = build_vector(vec, self.mod.nr_feature)
        # calculate number of outputs (nr_w)
        cdef int nc = self.mod.nr_class
        cdef int nr_w = 1 if (nc == 2 and self.mod.param.solver_type != MCSVM_CS) else nc
        cdef double * dec_values = <double *> malloc(nr_w * sizeof(double))
        predict_values(self.mod, v, dec_values)
        dv = [ dec_values[i] for i in range(nr_w) ]
        free(v)
        free(dec_values)
        return dv

    def get_weights(self):
        """ get weights of linear svm """
        cdef int nw = self.mod.nr_feature
        cdef int nc = self.mod.nr_class
        cdef list rv = []
        cdef int i
        cdef int num_w = 1 if (nc == 2 and self.mod.param.solver_type != MCSVM_CS) else nc
        # NOTE(review): the source text is corrupted at this point. Everything
        # between '0<=i' and 'malloc(' on the next statement is missing (it
        # looks as if text between a '<' and a '>' was stripped). Lost are:
        # the rest of get_weights, and the header and first statements of a
        # cross validation routine that uses 'data', 'num_folds', 'verbose',
        # 'prob', 'param', 'num_classes' and 'target'.
        # The remaining statements are kept unchanged below; they do NOT
        # belong to get_weights and this file will not compile until the
        # missing part is restored from version control.
        # Presumably 'target' is the buffer malloc'ed here and is never
        # freed before the final return - confirm when restoring.
        for i from 0<=i malloc(prob.num_all * sizeof(int))
        cross_validation(& prob.prob, & param.param, num_folds, target, verbose)
        # build confusion matrix, attention: rows must be copies, else
        # we get problems due to references
        # try out:
        #    confusion_matrix = [[0]*2]*2
        #    confusion_matrix[1][1] = 1
        #    print confusion_matrix
        # and you will see !!
        cdef list confusion_row = [0] * num_classes
        cdef list confusion_matrix = []
        cdef int i
        for i from 0 <= i < num_classes:
            confusion_matrix.append(confusion_row[:])
        for i from 0 <= i < prob.num_all:
            tobe = prob.prob.y[i]
            confusion_matrix[tobe][target[i]] += 1
        prob.release()
        param.release()
        return confusion_matrix