1*34b254c5SRichard Tran Mills# Linear regression with synthetic data generation 2*34b254c5SRichard Tran Mills# ================================================ 3*34b254c5SRichard Tran Mills# 4*34b254c5SRichard Tran Mills# Generate a synthetic data set using sklearn.datasets.make_regression() and 5*34b254c5SRichard Tran Mills# then solve it using the petsc4py interface to PetscRegressor. 6*34b254c5SRichard Tran Mills 7*34b254c5SRichard Tran Millsimport numpy as np 8*34b254c5SRichard Tran Mills# Needed for plotting 9*34b254c5SRichard Tran Millsimport matplotlib.colors 10*34b254c5SRichard Tran Millsimport matplotlib.pyplot as plt 11*34b254c5SRichard Tran Mills 12*34b254c5SRichard Tran Mills# Needed for generating classification, regression and clustering datasets 13*34b254c5SRichard Tran Millsimport sklearn.datasets as dt 14*34b254c5SRichard Tran Mills 15*34b254c5SRichard Tran Mills# Needed for evaluating the quality of the regression for the test data set 16*34b254c5SRichard Tran Millsfrom sklearn.metrics import mean_squared_error 17*34b254c5SRichard Tran Mills 18*34b254c5SRichard Tran Millsimport argparse 19*34b254c5SRichard Tran Millsparser = argparse.ArgumentParser('Test MLG using synthetic data') 20*34b254c5SRichard Tran Millsparser.add_argument('--nsample', type=int, default=10000) 21*34b254c5SRichard Tran Millsparser.add_argument('--nfeature', type=int, default=10) 22*34b254c5SRichard Tran Millsparser.add_argument('--add_noise', action='store_true') 23*34b254c5SRichard Tran Millsargs, unknown = parser.parse_known_args() 24*34b254c5SRichard Tran Mills 25*34b254c5SRichard Tran Millsimport sys 26*34b254c5SRichard Tran Millsimport petsc4py 27*34b254c5SRichard Tran Millssys.argv = [sys.argv[0]] + unknown 28*34b254c5SRichard Tran Millspetsc4py.init(sys.argv) 29*34b254c5SRichard Tran Millsfrom petsc4py import PETSc 30*34b254c5SRichard Tran Mills 31*34b254c5SRichard Tran Mills# Define the seed so that results can be reproduced 32*34b254c5SRichard Tran Millsseed = 11 33*34b254c5SRichard Tran Millsrand_state = 11 34*34b254c5SRichard Tran Mills 35*34b254c5SRichard Tran Mills# Define the color maps for plots 36*34b254c5SRichard Tran Millscolor_map = plt.cm.get_cmap('RdYlBu') 37*34b254c5SRichard Tran Millscolor_map_discrete = matplotlib.colors.LinearSegmentedColormap.from_list("", ["red","cyan","magenta","blue"]) 38*34b254c5SRichard Tran Mills 39*34b254c5SRichard Tran Millsdef petsc_regression_test(nsample, nfeature, noise=0, rand_state=11): 40*34b254c5SRichard Tran Mills ntr = round(nsample*9/10) 41*34b254c5SRichard Tran Mills nte = nsample - ntr 42*34b254c5SRichard Tran Mills x,y = dt.make_regression(n_samples=nsample, 43*34b254c5SRichard Tran Mills n_features=nfeature, 44*34b254c5SRichard Tran Mills noise=noise, 45*34b254c5SRichard Tran Mills random_state=rand_state) 46*34b254c5SRichard Tran Mills x_train, y_train = x[:ntr,], y[:ntr] 47*34b254c5SRichard Tran Mills xte, yte = x[ntr:,], y[ntr:] 48*34b254c5SRichard Tran Mills comm = PETSc.COMM_WORLD 49*34b254c5SRichard Tran Mills rank = comm.getRank() 50*34b254c5SRichard Tran Mills regressor = PETSc.Regressor().create(comm=comm) 51*34b254c5SRichard Tran Mills regressor.setType(PETSc.Regressor.Type.LINEAR) 52*34b254c5SRichard Tran Mills rows_ix = np.arange(ntr,dtype=np.int32) 53*34b254c5SRichard Tran Mills cols_ix = np.arange(nfeature,dtype=np.int32) 54*34b254c5SRichard Tran Mills X = PETSc.Mat().create(comm=comm) 55*34b254c5SRichard Tran Mills X.setSizes((ntr,nfeature)) 56*34b254c5SRichard Tran Mills X.setFromOptions() 57*34b254c5SRichard Tran Mills X.setUp() 58*34b254c5SRichard Tran Mills y = PETSc.Vec().create(comm=comm) 59*34b254c5SRichard Tran Mills y.setSizes(ntr) 60*34b254c5SRichard Tran Mills y.setFromOptions() 61*34b254c5SRichard Tran Mills if not rank : 62*34b254c5SRichard Tran Mills X.setValues(rows_ix,cols_ix,x_train,addv=True) 63*34b254c5SRichard Tran Mills y.setValues(rows_ix,y_train,addv=False) 64*34b254c5SRichard Tran Mills X.assemblyBegin(X.AssemblyType.FINAL) 65*34b254c5SRichard Tran Mills X.assemblyEnd(X.AssemblyType.FINAL) 66*34b254c5SRichard Tran Mills y.assemblyBegin() 67*34b254c5SRichard Tran Mills y.assemblyEnd() 68*34b254c5SRichard Tran Mills regressor.fit(X,y) 69*34b254c5SRichard Tran Mills rows_ix = np.arange(nte,dtype=np.int32) 70*34b254c5SRichard Tran Mills X = PETSc.Mat().create(comm=comm) 71*34b254c5SRichard Tran Mills X.setSizes((nte,nfeature)) 72*34b254c5SRichard Tran Mills X.setFromOptions() 73*34b254c5SRichard Tran Mills X.setUp() 74*34b254c5SRichard Tran Mills y = PETSc.Vec().create(comm=comm) 75*34b254c5SRichard Tran Mills y.setSizes(nte) 76*34b254c5SRichard Tran Mills y.setFromOptions() 77*34b254c5SRichard Tran Mills if not rank : 78*34b254c5SRichard Tran Mills X.setValues(rows_ix,cols_ix,xte,addv=True) 79*34b254c5SRichard Tran Mills y.zeroEntries() 80*34b254c5SRichard Tran Mills X.assemblyBegin(X.AssemblyType.FINAL) 81*34b254c5SRichard Tran Mills X.assemblyEnd(X.AssemblyType.FINAL) 82*34b254c5SRichard Tran Mills y.assemblyBegin() 83*34b254c5SRichard Tran Mills y.assemblyEnd() 84*34b254c5SRichard Tran Mills regressor.predict(X,y) 85*34b254c5SRichard Tran Mills ypr = y.getArray() 86*34b254c5SRichard Tran Mills error = mean_squared_error(ypr,yte) 87*34b254c5SRichard Tran Mills print(f"Test MSE: {error:f}") 88*34b254c5SRichard Tran Mills return xte,ypr 89*34b254c5SRichard Tran Mills 90*34b254c5SRichard Tran Millsxte,ypr = petsc_regression_test(args.nsample, args.nfeature, 0) 91*34b254c5SRichard Tran Mills 92*34b254c5SRichard Tran Millsif args.add_noise: 93*34b254c5SRichard Tran Mills fig,ax = plt.subplots(nrows=2, ncols=3,figsize=(16,7)) 94*34b254c5SRichard Tran Mills plt_ind_list = np.arange(6)+231 95*34b254c5SRichard Tran Mills 96*34b254c5SRichard Tran Mills for noise,plt_ind in zip([0,0.1,1,10,100,1000],plt_ind_list): 97*34b254c5SRichard Tran Mills xte,ypr = petsc_regression_test(args.nsample, args.nfeature, noise, rand_state) 98*34b254c5SRichard Tran Mills 99*34b254c5SRichard Tran Mills plt.subplot(plt_ind) 100*34b254c5SRichard Tran Mills my_scatter_plot = plt.scatter(xte[:,0], 101*34b254c5SRichard Tran Mills xte[:,1], 102*34b254c5SRichard Tran Mills c=ypr, 103*34b254c5SRichard Tran Mills vmin=min(ypr), 104*34b254c5SRichard Tran Mills vmax=max(ypr), 105*34b254c5SRichard Tran Mills s=35, 106*34b254c5SRichard Tran Mills cmap=color_map) 107*34b254c5SRichard Tran Mills 108*34b254c5SRichard Tran Mills plt.title('noise: '+str(noise)) 109*34b254c5SRichard Tran Mills plt.colorbar(my_scatter_plot) 110*34b254c5SRichard Tran Mills 111*34b254c5SRichard Tran Mills fig.subplots_adjust(hspace=0.3,wspace=.3) 112*34b254c5SRichard Tran Mills plt.suptitle('PETSc regression tests with different noise levels',fontsize=20) 113*34b254c5SRichard Tran Mills plt.show(block=True) 114