#encoding:latin-1
import numpy as np


def get_top_n_variables(X, Y, n, eps=1e-2, maxiter=1000000, verbose=0):
    """Select variables by forward stagewise regression.

    Implements the incremental forward stagewise procedure described in:

    [1] Efron, Hastie, Johnstone, Tibshirani, "Least Angle Regression",
        The Annals of Statistics, 2004, Vol. 32, pp. 407-451,
        doi 10.1214/009053604000000067

    Parameters
    ----------
    X : array-like, shape (num_samples, num_feat)
        Predictor matrix. It is standardized on an internal float copy;
        the caller's array is left untouched.
    Y : array-like, shape (num_samples,)
        Response vector.
    n : int
        Number of active variables at which the search stops.
    eps : float, optional
        Step size added to the prediction along the chosen predictor in
        each iteration.
    maxiter : int, optional
        Upper bound on the number of iterations.
    verbose : int, optional
        If truthy, print one line each time a variable becomes active
        (iteration index, number of active variables, active indices).

    Returns
    -------
    numpy.ndarray
        Indices (ascending) of the variables that became active. This can
        hold fewer than ``n`` entries if ``maxiter`` is exhausted, if no
        predictor is correlated with the residual any more, or if there
        are fewer than ``n`` predictors.
    """
    # Work on a private float copy: the standardization below is done in
    # place and must neither overwrite the caller's data nor fail on
    # integer input.
    X = np.array(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    num_samples, num_feat = X.shape

    # Constant columns have zero spread; dividing by it would produce NaN
    # correlations that argmax would then pick. Detect them up front.
    constant = np.all(X == X[:1], axis=0)

    # scale predictors to mean = 0 and stdev = 1
    mvx = X.mean(axis=0)
    stdx = X.std(axis=0)
    stdx[constant] = 1.0
    X -= mvx
    X /= stdx
    # a constant predictor carries no information: make it exactly zero
    X[:, constant] = 0.0

    # initialize prediction vector
    mue = np.zeros((num_samples,))
    # initialize active variables
    isactive = np.zeros((num_feat,), dtype=bool)
    num_active = 0

    for i in range(maxiter):
        # correlation of the current residual with every predictor
        c = np.dot(X.T, Y - mue)
        # predictor with the largest absolute correlation
        jot = np.argmax(np.abs(c))
        if c[jot] == 0:
            # Nothing is correlated with the residual: the update below
            # would be a zero step, so no further progress is possible.
            break
        # update active variables
        if not isactive[jot]:
            isactive[jot] = True
            num_active += 1
            if verbose:
                print("%9d : %d %s"
                      % (i, num_active, np.where(isactive)[0]))
            # Stop at the requested count, or once every predictor is
            # active (the result cannot change after that).
            if num_active == n or num_active == num_feat:
                break
        # move the prediction a small step towards the chosen predictor
        mue += eps * X[:, jot] * np.sign(c[jot])

    return np.where(isactive)[0]


if __name__ == "__main__":
    # Read the data used in [1]. scipy.io.read_array no longer exists in
    # SciPy; np.loadtxt with skiprows=1 stands in for lines=(1,-1).
    data = np.loadtxt("diabetes_unscaled.dat", skiprows=1)
    X = data[:, :10]
    Y = data[:, 10]
    # Single-argument print calls behave the same on Python 2 and 3.
    print("determine top 5 relevant variables:")
    print("")
    top_vars = get_top_n_variables(X, Y, 5, verbose=1)
    print("")
    print("important variables:  %s" % (top_vars,))