2022-04-12

#### Section 4: demo code for xgboost (Extreme Gradient Boosting) ####

# --------------------------------------------------------

rm(list=ls()) # clear the environment

library(ISLR) # contains the data

library(xgboost) # XGBoost... and xgb.DMatrix()

library(caret)

set.seed(4061) # for reproducibility

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change the response variable to a factor to make this a

# classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
                              "High","Low"))

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set for later

# x = select(dat.train, -"Salary") ### if using dplyr

# training set:

x = dat.train

x$Salary = NULL

y = dat.train$Salary

# test set:

x.test = dat.validation

x.test$Salary = NULL

y.test = dat.validation$Salary

# XGBoost...

set.seed(4061)

# (a) line up the data in the required format

# train set:

xm = model.matrix(y~., data=x)[,-1]

x.xgb = xgb.DMatrix(xm)

# test set:

xm.test = model.matrix(y.test~., x.test)[,-1]

x.xgb.test = xgb.DMatrix(xm.test)

# (b) training...

# NB: one can run XGBoost with default parameters by using:

xgb.ctrl = trainControl(method="none")

# otherwise:

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.model = train(x.xgb, y, trControl=xgb.ctrl, method="xgbTree")

# NB: use argument tuneGrid to specify custom grids of values for

# tuning parameters. Otherwise train() picks its own grids.

xgb.model$bestTune
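# For instance, a minimal custom grid might look like this (values are arbitrary and
# for illustration only; the xgbTree method expects exactly these seven parameters):
my.grid = expand.grid(nrounds=c(100,200), max_depth=c(2,4), eta=0.1, gamma=0,
                      colsample_bytree=1, min_child_weight=1, subsample=1)
xgb.custom = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=my.grid, method="xgbTree")
xgb.custom$bestTune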

# (c) testing...

xgb.pred = predict(xgb.model, newdata=x.xgb.test)

confusionMatrix(data=xgb.pred, reference=y.test)

# --------------------------------------------------------

# The below demo code is only for information. There is no need

# to spend time looking into it for ST4061/ST6041 tests/exam!

#

# There are a number of parameters to be tuned for XGBoost:

modelLookup('xgbTree')

# All or a subset of these parameters can be tuned in a sequential

# manner. For each tuning parameter, we can define a grid of

# potential values and search for an optimal value within that grid.

#

# Careful! Running this code will take some time...

#

# (1) Max number of trees (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
                        eta = c(0.025, 0.05, 0.1, 0.3),
                        max_depth = c(2, 3, 4, 5, 6),
                        gamma = 0,
                        colsample_bytree = 1,
                        min_child_weight = 1,
                        subsample = 1)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

#

# (2) Max tree depth and min child weight (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
                        eta = xgb.tune$bestTune$eta,
                        max_depth = c(1:xgb.tune$bestTune$max_depth+2),
                        gamma = 0,
                        colsample_bytree = 1,
                        min_child_weight = c(1:3),
                        subsample = 1)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

#

# (3) sampling (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
                        eta = xgb.tune$bestTune$eta,
                        max_depth = xgb.tune$bestTune$max_depth,
                        gamma = 0,
                        colsample_bytree = seq(0.2,1,by=.2),
                        min_child_weight = xgb.tune$bestTune$min_child_weight,
                        subsample = seq(.5,1,by=.1))

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

#

# (4) gamma (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),
                        eta = xgb.tune$bestTune$eta,
                        max_depth = xgb.tune$bestTune$max_depth,
                        gamma = seq(0,1,by=.1),
                        colsample_bytree = xgb.tune$bestTune$colsample_bytree,
                        min_child_weight = xgb.tune$bestTune$min_child_weight,
                        subsample = xgb.tune$bestTune$subsample)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

#

# (5) learning rate (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 5000, by=100),
                        eta = c(0.01,0.02,0.05,0.075,0.1),
                        max_depth = xgb.tune$bestTune$max_depth,
                        gamma = xgb.tune$bestTune$gamma,
                        colsample_bytree = xgb.tune$bestTune$colsample_bytree,
                        min_child_weight = xgb.tune$bestTune$min_child_weight,
                        subsample = xgb.tune$bestTune$subsample)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

#

# Then fit:

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# testing:

xgb.pred = predict(xgb.tune, newdata=x.xgb.test) # use the tuned model this time

confusionMatrix(data=xgb.pred, reference=y.test)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Section 5: SVM (Support Vector Machine) ####

# --------------------------------------------------------

rm(list=ls()) # clear out the running environment

library(randomForest)

library(class)

library(pROC)

library(e1071)

# SVM: a rather sophisticated classification tool

#### Example using SVM on the "best-student" data ####

# simulate data (generate random numbers):

set.seed(1)

n = 100

mark = rnorm(n, m=50, sd=10)

choc = rnorm(n, m=60, sd=5)

summary(mark)

summary(choc)

# rating rule -- separating hyperplane: f(x) = int + a*mark + b*choc

int = 10

a = 2

b = 4

# rating of students on basis of their marks and level of appreciation of chocolate:

# generate the data set:

mod = int + a*mark + b*choc  # values from the true model (points on the separating hyperplane)

z = rnorm(n, s=8)  # additive noise

obs = mod + z # actual observations scattered around the separating hyperplane
# classify:
y = as.factor(ifelse(obs>350,1,0)) # classification data: y records the true class of each observation

plot(mod, obs, xlab="model", ylab="observations", pch=20, cex=2)

plot(mark, choc, xlab="x1 (mark)", ylab="x2 (choc)",
     pch=20, col=c(2,4)[as.numeric(y)], cex=2)
legend("topright", box.col=8, bty='n',
       legend=c("good student","better student"),
       pch=15, col=c(2,4))

table(y) # gives an idea of how hard the dataset is to classify

set.seed(1)

# split the data into train+test (50%-50%):

# the dataset consists of predictors x (mark & choc) and the corresponding classes y

x = data.frame(mark,choc)

i.train = sample(1:n, 50)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

class(x.train)

class(y.train)

xm = as.matrix(x.train)

# fit an SVM as follows:

# ?svm # in e1071

set.seed(1)

svmo = svm(xm, y.train, kernel='polynomial') # the kernel can be changed; eyeball the plot to guess the shape of the boundary
# here we feed in x and y and, from visual inspection, assume a polynomial (curved) kernel
# straight line: 'linear'; curve: 'polynomial'; circle: 'radial' basis

svmo

names(svmo)

# cf. ?svm:

# "Parameters of SVM-models usually must be tuned to yield sensible results!"

# i.e. to find the most sensible SVM parameters (the maximum-margin hyperplane;
# only points close to the margin influence it),
# one can tune the model as follows (searching within the specified ranges of cost and gamma):

set.seed(1)

svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,
                       kernel='polynomial',
                       ranges=list(cost=10^(-2:4), gamma=c(0.25,0.5,1,1.5,2)))
                       # ranges lists the tuning-parameter values we want to try

svm.tune

svm.tune$best.parameters # the best parameter combination
# then fit the final SVM using the optimal parameters:

svmo.final = svm(xm, y.train, kernel='polynomial',
                 gamma=svm.tune$best.parameters$gamma,
                 cost=svm.tune$best.parameters$cost)

# corresponding confusion matrices (to see how well each model separates the classes):

table(svmo$fitted,y.train)

table(svmo.final$fitted,y.train)

# we can also use caret for easy comparison (look directly at accuracy):

library(caret)

caret::confusionMatrix(svmo$fitted,y.train)$overall[1]

caret::confusionMatrix(svmo.final$fitted,y.train)$overall[1]

# assessing model fit to the training data
identical(fitted(svmo), svmo$fitted) # TRUE, i.e. both give the same fitted values

# to identify support vectors (the points that can shift the hyperplane):

# either svmo$index (indices), or svmo$SV (coordinates)

length(svmo$index)       # before tuning
length(svmo.final$index) # after tuning: fewer support vectors, i.e. a cleaner separation and a better SVM fit

# visualize:

plot(x.train, pch=20, col=c(1,2)[y.train], cex=2)

points(svmo$SV, pch=14, col=4, cex=2) # explain why this does not work!?

# apply scaling to the dataset to see the SV's: the data must be scaled first so that the support vectors
# (the points near the margin that determine the hyperplane) can be overlaid on the same scale

plot(apply(x.train,2,scale), pch=20, col=c(1,2)[y.train], cex=2)

points(svmo$SV, pch=14, col=4, cex=2)

points(svmo.final$SV, pch=5, col=3, cex=2)

# If you want to use predict(), use a formula-type

# expression when calling svm(). Because of this,

# we re-shape our dataset:


dat.train = data.frame(x=x.train, y=y.train)

dat.test = data.frame(x=x.test)

# decision boundary visualization:

svmo = svm(y~., data=dat.train)

plot(svmo, dat.train,
     svSymbol = 15, dataSymbol = 'o',
     col=c('cyan','pink')) # this is plot.svm()

svmo.final = svm(y~., data=dat.train, kernel='polynomial',
                 gamma=svm.tune$best.parameters$gamma,
                 cost=svm.tune$best.parameters$cost) # the best fit

plot(svmo.final, dat.train,
     svSymbol = 15, dataSymbol = 'o',
     col=c('cyan','pink')) # this is plot.svm()

# How to generate predictions from SVM fit:

# fitting the SVM model:

svmo = svm(y~., data=dat.train,
           kernel='polynomial',
           gamma=svm.tune$best.parameters$gamma,
           cost=svm.tune$best.parameters$cost)

# Note that if we need probabilities P(Y=1)

# (for example to calculate ROC+AUC),

# we need to set 'probability=TRUE' also in

# fitting the SVM model:

svmo = svm(y~., data=dat.train, probability=TRUE,
           kernel='polynomial',
           gamma=svm.tune$best.parameters$gamma,
           cost=svm.tune$best.parameters$cost)

#Generate predictions from SVM fit:

svmp = predict(svmo, newdata=dat.test, probability=TRUE)

roc.svm = roc(response=y.test, predictor=attributes(svmp)$probabilities[,2])

roc.svm$auc # the closer to 1, the better
plot(roc.svm) # looks very good

# compare with RF:

rf = randomForest(y~., data=dat.train)

rfp = predict(rf, dat.test, type='prob')

roc.rf = roc(y.test, rfp[,2])

roc.rf$auc

plot(roc.svm)

par(new=TRUE)

plot(roc.rf, col='yellow')

legend("bottomright", bty='n',

? ? ? legend=c("RF","SVM"),

? ? ? lty=1, lwd=3, col=c('yellow',1))

# the random forest does not beat the SVM here

# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
#### Section 5: demo code for effect of kernel on SVM ####
# Here we simulate 2D data that have a circular spatial
# distribution, to see the effect of the choice of kernel
# shape on decision boundaries
# --------------------------------------------------------

rm(list=ls())

library(e1071)

# Simulate circular data...

# simulate 2-class circular data:

set.seed(4061)

n = 100

S1 = 15; S2 = 3

x1 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))

x2 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))

# corresponding 2D circular radii:

rads = sqrt(x1^2+x2^2)

# make up the 2 classes in terms of whether lower or greater than median radius:

c1 = which(rads<median(rads))

c2 = c(1:n)[-c1]

# now we apply scaling factors to further separate the

# 2 classes:

x1[c1] = x1[c1]/1.2

x2[c1] = x2[c1]/1.2

x1[c2] = x1[c2]*1.2

x2[c2] = x2[c2]*1.2

# label data according to class membership:

lab = rep(1,n)

lab[c2] = 2 # in lab, observations in c1 are labelled 1 and those in c2 are labelled 2

par(mfrow=c(1,1))

plot(x1,x2,col=c(1,2)[lab], pch=c(15,20)[lab], cex=1.5)

# create final data frame:

x = data.frame(x1,x2)

y = as.factor(lab)

dat = cbind(x,y)

# apply SVMs with different choices of kernel shapes:

svmo.lin = svm(y~., data=dat, kernel='linear')

svmo.pol = svm(y~., data=dat, kernel='polynomial')

svmo.rad = svm(y~., data=dat, kernel='radial')

svmo.sig = svm(y~., data=dat, kernel='sigmoid')

plot(svmo.lin, dat, col=c("cyan","pink"), svSymbol=15)

plot(svmo.pol, dat, col=c("cyan","pink"), svSymbol=15)

plot(svmo.rad, dat, col=c("cyan","pink"), svSymbol=15)

plot(svmo.sig, dat, col=c("cyan","pink"), svSymbol=15)

#### NOTE: the code below is outside the scope of this course!

#### It is used here for illustrations purposes only.

# this call format is easier when using predict():

svmo.lin = svm(x, y, kernel='linear', scale=F)

svmo.pol = svm(x, y, kernel='polynomial', scale=F)

svmo.rad = svm(x, y, kernel='radial', scale=F)

svmo.sig = svm(x, y, kernel='sigmoid', scale=F)

# evaluate the SVM boundaries on a regular 2D grid of points:

ng    = 50
xrg   = apply(x, 2, range)
x1g   = seq(xrg[1,1], xrg[2,1], length=ng)
x2g   = seq(xrg[1,2], xrg[2,2], length=ng)

xgrid = expand.grid(x1g, x2g)

plot(x, col=c(1,2)[y], pch=20)

abline(v=x1g, col=8, lty=1)

abline(h=x2g, col=8, lty=1)

#

ygrid.lin = predict(svmo.lin, xgrid)

ygrid.pol = predict(svmo.pol, xgrid)

ygrid.rad = predict(svmo.rad, xgrid)

ygrid.sig = predict(svmo.sig, xgrid)

par(mfrow=c(2,2), font.lab=2, font.axis=2)

CEX = .5

COLS = c(1,3)

DCOLS = c(2,4)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

# points(x[svmo.lin$index,], pch=21, cex=2)

points(svmo.lin$SV, pch=21, cex=2) # same as previous line!

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

points(x[svmo.pol$index,], pch=21, cex=2)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

points(x[svmo.rad$index,], pch=21, cex=2)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

points(x[svmo.sig$index,], pch=21, cex=2)

# Alternative plot:

par(mfrow=c(2,2), font.lab=2, font.axis=2)

CEX = .5

COLS = c(1,3)

DCOLS = c(2,4)

#

L1 = length(x1g)

L2 = length(x2g)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel")

bnds = attributes(predict(svmo.lin, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel")

bnds = attributes(predict(svmo.pol, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel")

bnds = attributes(predict(svmo.rad, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

#

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel")

bnds = attributes(predict(svmo.sig, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

# NB: naive Bayes decision boundary is obtained with

# contour(x1g, x2g, matrix(bnds, L1, L2), level=0.5, add=TRUE, col=4, lwd=2)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 5: SVM ####

# --------------------------------------------------------

rm(list=ls())

library(randomForest)

library(class)

library(pROC)

library(e1071)

library(caret)

library(ISLR)

###############################################################

#### Exercise 1: using SVM to classify (High Carseat sales dataset) ####

###############################################################

library(ISLR) # contains the dataset

# Recode response variable so as to make it a classification problem

High = ifelse(Carseats$Sales<=8, "No", "Yes")

CS = data.frame(Carseats, High)

CS$Sales = NULL

#construct dataset

x = CS

x$High = NULL

y = CS$High

# split the data into train+test (50%-50%):

n = nrow(CS)

set.seed(4061)

i.train = sample(1:n, 350)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

class(x.train)

class(y.train)

# ?svm : svm() can be called in two ways:
# svmo = svm(xm, y.train, kernel='polynomial')  ## svm(x, y = NULL, scale = TRUE, type = NULL, kernel = "radial")
# svmo = svm(y~., data=dat.train)               ## svm(formula, data = NULL, ..., subset, na.action = na.omit, scale = TRUE)
# in the first form x must be fed as a matrix and y as a numeric/factor vector,
# e.g. x.train = as.matrix(x.train) and y.train = as.numeric(y.train)

# (3) Explain why this does not work:

svmo = svm(x.train, y.train, kernel='polynomial')

# >> The problem is the presence of categorical variables in

# the dataset. They must be "recoded" into numerical variables

# for svm() to analyse their spatial contribution.

# (4) Carry out the appropriate fix from your conclusion from (a).

# Then, fit two SVM models, one using a linear kernel, the other

# a polynomial kernel. Compare the two appropriately.

# the fix:

NC = ncol(x)

# x = x[,-c(NC-1,NC)] # take out the last two columns (predictors)

xm = model.matrix(y~.+0, data=x) # +0 removes the intercept column

xm.train = xm[i.train,]

xm.test = xm[-i.train,]

y.train = as.factor(y.train) # so that svm knows it's classification

svmo.lin = svm(xm.train, y.train, kernel='linear')

svmo.pol = svm(xm.train, y.train, kernel='polynomial')

svmy.lin = fitted(svmo.lin) # fitted() returns the predicted class for each training observation

svmy.pol = fitted(svmo.pol)

table(y.train, fitted(svmo.lin))

table(y.train, fitted(svmo.pol))

# (5) Comparison...

# * visual (there are better ways of visualising!):

par(mfrow=c(1,3))

yl = as.integer(y=="Yes")+1

plot(apply(xm,2,scale), pch=c(15,20)[yl], col=c(1,4)[yl],
     cex=c(1.2,2)[yl], main="The data")

#

plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train],
     cex=1, main="linear")

points(svmo.lin$SV, pch=5, col=2, cex=1.2)

#

plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train],
     cex=1, main="polynomial")

points(svmo.pol$SV, pch=5, col=2, cex=1.2)

# * in terms of training fit:

svmy.lin = fitted(svmo.lin)

svmy.pol = fitted(svmo.pol)

table(y.train, svmy.lin)

table(y.train, svmy.pol)

# * test error:

pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE)

pred.pol = predict(svmo.pol, newdata=xm.test)

# ... the above does not work well:

summary(pred.lin)

# --> these are not probabilities! That's because we need to specify

# ", probability=TRUE" also when fitting the SVM, in order to enable

# probabilities to be computed and returned...


# SO IF WE WANT TO GENERATE TEST-SET PREDICTIONS, THIS IS THE WAY:

svmo.lin = svm(xm.train, y.train, kernel='linear', probability=TRUE)

svmo.pol = svm(xm.train, y.train, kernel='polynomial', probability=TRUE)

pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE)

pred.pol = predict(svmo.pol, newdata=xm.test, probability=TRUE)

y.test = as.factor(y.test)

confusionMatrix(y.test, pred.lin)

confusionMatrix(y.test, pred.pol)

# * AUC (we need to extract P(Y=1|X))

p.lin = attributes(pred.lin)$probabilities[,2]

p.pol = attributes(pred.pol)$probabilities[,2]

y.test = as.factor(y.test)

roc(response=y.test, predictor=p.lin)$auc

roc(response=y.test, predictor=p.pol)$auc

# both sensitivity and specificity are high

###############################################################

#### Exercise 2: 3-class problem (iris dataset) ####

###############################################################

x = iris

x$Species = NULL

y = iris$Species

set.seed(4061)

n = nrow(x)

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

# (a)

plot(x.train[,1:2], pch=20, col=c(1,2,4)[as.numeric(y.train)], cex=2)

# (b)

dat = data.frame(x.train, y=as.factor(y.train))

svmo.lin = svm(y~., data=dat, kernel='linear')

svmo.pol = svm(y~., data=dat, kernel='polynomial')

svmo.rad = svm(y~., data=dat, kernel='radial')

#

# number of support vectors:

summary(svmo.lin)

summary(svmo.pol)

summary(svmo.rad)

# The fewer the support vectors, the simpler the fitted decision boundary.

# test error:

pred.lin = predict(svmo.lin, newdata=x.test)

pred.pol = predict(svmo.pol, newdata=x.test)

pred.rad = predict(svmo.rad, newdata=x.test)

cm.lin = confusionMatrix(y.test, pred.lin)

cm.pol = confusionMatrix(y.test, pred.pol)

cm.rad = confusionMatrix(y.test, pred.rad)

c(cm.lin$overall[1], cm.pol$overall[1], cm.rad$overall[1])

# the radial kernel is more accurate here.
# (c) tune the model (via cross-validation)...

set.seed(4061)

svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,
                       kernel='radial',
                       ranges=list(cost=10^(-2:2), gamma=c(0.5,1,1.5,2)))

print(svm.tune)

names(svm.tune)

# retrieve optimal hyper-parameters

bp = svm.tune$best.parameters

# use these to obtain final SVM fit:

svmo.rad.tuned = svm(y~., data=dat, kernel='radial',
                     cost=bp$cost, gamma=bp$gamma)

summary(svmo.rad)

summary(svmo.rad.tuned) # not changed much, which suggests performance has more to do with the shape
# of the kernel itself than with how the model is tuned for this dataset.

# test set predictions from tuned SVM model:

pred.rad.tuned = predict(svmo.rad.tuned, newdata=x.test)

cm.rad.tuned = confusionMatrix(y.test, pred.rad.tuned)

c(cm.rad$overall[1], cm.rad.tuned$overall[1])

# so maybe not an exact science!?

# ... in fact these performances are comparable, bear in mind CV assessment is

# itself subject to variability...

# These are estimates of prediction performance, so there is uncertainty attached to the values obtained here;
# when you see 100% vs 96%, you may really be looking at the same thing. You need to consider a confidence
# interval around these values to decide whether you are really seeing a difference or not.
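# For instance, a rough sketch of such an interval (assuming cm.rad.tuned and y.test from
# above; binom.test() returns an exact 95% confidence interval for a proportion):
n.test = length(y.test)
acc = as.numeric(cm.rad.tuned$overall[1])
binom.test(round(acc*n.test), n.test)$conf.int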

###############################################################

#### Exercise 3: SVM using caret ####

###############################################################

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change into a classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),
                              "High","Low"))

# Data partition

set.seed(4061)

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set

x = dat.train # training set

x$Salary = NULL

y = as.factor(dat.train$Salary)

### Random forest

rf.out = caret::train(Salary~., data=dat.train, method='rf')

rf.pred = predict(rf.out, dat.validation)

rf.cm = confusionMatrix(reference=dat.validation$Salary, data=rf.pred, mode="everything")

### SVM (linear)

svm.out = caret::train(Salary~., data=dat.train, method="svmLinear")

svm.pred = predict(svm.out, dat.validation)

svm.cm = confusionMatrix(reference=dat.validation$Salary, data=svm.pred, mode="everything")

# modelLookup('svmRadial')

### SVM (radial)

svmR.out = caret::train(Salary~., data=dat.train, method="svmRadial")

svmR.pred = predict(svmR.out, dat.validation)

svmR.cm = confusionMatrix(reference=dat.validation$Salary, data=svmR.pred, mode="everything")

perf = rbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall)

row.names(perf) = c("RF","SVM.linear","SVM.radial")

round(perf, 4)

perf = cbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall)

colnames(perf) = c("RF","SVM.linear","SVM.radial")

round(perf, 4)

# to compare, look at accuracy; from the earlier plots the data do not look linear, so RF should do comparatively well

###############################################################

#### Exercise 4: SVM-based regression ####

###############################################################

x = iris

x$Sepal.Length = NULL

y = iris$Sepal.Length # using Sepal.Length as the response variable

pairs(iris[,1:4])

?pairs # pairs() produces a matrix of scatterplots, one for each pair of columns of X

set.seed(4061)

n = nrow(x)

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

dat.train = cbind(x.train,y=y.train)

# specify statistical training settings:

ctrl = caret::trainControl(method='cv')

# perform statistical training:

svm.o = caret::train(y~., data=dat.train, method="svmLinear",
                     trControl=ctrl)

# compute test set predictions:

svm.p = predict(svm.o, newdata=x.test)

# and corresponding MSE:

mean( (y.test-svm.p)^2 )

par(pty='s') # makes for a square plot box

rr = range(c(y.test, svm.p))

plot(y.test, svm.p, pch=20,
     xlab="true values", ylab="predicted values",
     xlim=rr, ylim=rr)

abline(a=0,b=1)

# this plot is quite informative

#### Section 6: Neural Networks ####
#### classification: min-max normalisation; regression: standardisation ####
# learning rate: applied to calibrate the speed of the learning process.
# DL (deep learning) vs ML (machine learning)
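# A minimal sketch of the two rescaling options mentioned above, using Sepal.Length as an
# example (illustrative only):
v = iris$Sepal.Length
v.norm = (v - min(v)) / (max(v) - min(v)) # min-max normalisation to [0,1]
v.std  = (v - mean(v)) / sd(v)            # standardisation to mean 0, sd 1
summary(v.norm)
summary(v.std)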

# ------------------------------------------------------------

#### Example 1 : iris data with neuralnet ####

#install.packages('neuralnet')

library(neuralnet)

n = nrow(iris)

dat = iris[sample(1:n), ] # shuffle initial dataset

NC = ncol(dat)

nno = neuralnet(Species~., data=dat, hidden=c(6,5)) # use this form when we know how many layers and nodes per layer we want
# hidden=c(6,5): 6 neurons in the first hidden layer, 5 in the second

plot(nno,
     information=FALSE, # do not annotate with values
     col.entry='red',
     col.out='green',
     show.weights=FALSE)

plot(nno,
     information=TRUE, # annotate with values
     col.entry='red',
     col.out='green',
     show.weights=TRUE)

# (1) the 'blue' bits are the bias terms
# (2) the numbers on the black lines are the weights; they can be positive or negative
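# To see the numbers behind the plot directly (a quick check, assuming the $weights and
# $result.matrix components of a neuralnet fit):
str(nno$weights)        # list of weight matrices, one per layer (including bias terms)
head(nno$result.matrix) # error, reached threshold, steps, followed by all weights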

#### Example 2: single layer NN - regression - effect of scaling ####

library(nnet)    # implements single-layer NNs

library(mlbench) # includes dataset BostonHousing

#install.packages('mlbench')

data(BostonHousing) # load the dataset

#View(BostonHousing)

# train neural net

n = nrow(BostonHousing)

itrain = sample(1:n, round(.7*n), replace=FALSE) # take a (rounded) 70% of the data as the training sample

nno = nnet(medv~., data=BostonHousing, subset=itrain, size=5) # size = number of units in the hidden layer
# (we only specify the number of neurons; nnet fits a single hidden layer)

#?nnet

summary(nno$fitted.values)

# The fitted values are all 1 because the response was not scaled; we should scale it first.

# the above output indicates the algorithm did not

# converge, probably due to explosion of the gradients...


# We try again, this time normalizing the values

# (50 is the max value for this data, see its range):

nno = nnet(medv/50~., data=BostonHousing, subset=itrain, size=5)

summary(nno$fitted.values)# there was thus a need to normalise the response variable...

# test the neural network

preds = predict(nno, newdata=BostonHousing[-itrain,]) * 50 # (we multiply by 50 to fall back to original data domain)


summary(preds)

# RMSE (root mean squared error):

sqrt(mean((preds-BostonHousing$medv[-itrain])^2))

# compare with lm():

lmo = lm(medv~., data=BostonHousing, subset=itrain)

lm.preds = predict(lmo, newdata= BostonHousing[-itrain,])

# RMSE:

sqrt(mean((lm.preds-BostonHousing$medv[-itrain])^2))

# Compared with lm, the NN achieves a lower test RMSE here.
# when assessing, check length(itrain) and dim(BostonHousing) to make sure there are enough training and test samples

# Further diagnostics may highlight various aspects of the
# model fit - always run these checks (and compare with lm)!

par(mfrow=c(2,2))

################################################################################

# PLOT (1): residuals from LM against residuals from NN

plot(lmo$residuals, nno$residuals*50)

abline(a=0, b=1, col="limegreen")

#there are some extreme residuals from LM

################################################################################

#PLOT(2): residuals from LM against Original Train Data - no relationship

plot(BostonHousing$medv[itrain], lmo$residuals, pch=20)

################################################################################

#PLOT(3): residuals from LM against Original Train Data (in grey)

#          plus (+) residuals from NN against Original Train Data

plot(BostonHousing$medv[itrain], lmo$residuals, pch=20, col=8)

points(BostonHousing$medv[itrain], nno$residuals*50, pch=20)

################################################################################

#PLOT(4): QQ plot

# In general, noise is assumed to be normally distributed (Gaussian)...
# In NNs we do not make this assumption, since we allow a nonlinear function between X (input) and Y (output).
# A QQ plot is still a useful tool, though.

qqnorm(nno$residuals)

abline(a=mean(nno$residuals), b=sd(nno$residuals), col=2)

#### Example 3: effect of tuning parameters (iris data) ####

rm(list=ls())

n = nrow(iris)

# shuffle the initial dataset as usual, and drop the 4th column (Petal.Width) to make the problem a little harder

dat = iris[sample(1:n),-4]

NC = ncol(dat)

# data scaling: not standardising to mean 0 / sd 1, but rescaling the data to the [0,1] range
#### scale function: y_normalized = (y-min(y)) / (max(y)-min(y)) ####
# the factor column (Species) is not rescaled, hence dat[,-NC]

mins = apply(dat[,-NC],2,min)

maxs = apply(dat[,-NC],2,max)

dats = dat

dats[,-NC] = scale(dats[,-NC],center=mins,scale=maxs-mins)

# set up the training sample:

itrain = sample(1:n, round(.7*n), replace=FALSE)

nno = nnet(Species~., data=dats, subset=itrain, size=5)

# predictions:
nnop = predict(nno, dats[-itrain,])
head(nnop) # first few rows of predicted class probabilities

# this is one way of obtaining predicted labels from the probabilities:
preds = max.col(nnop) # index of the column with the largest value in each row (used for the confusion matrix)
# (the line above picks, for each observation, the class with the highest probability); or we can do it directly:

preds = predict(nno, dats[-itrain,], type='class')

tbp = table(preds, dats$Species[-itrain]) # confusion matrix
sum(diag(tbp))/sum(tbp) # accuracy

# how does size affect accuracy in nnet?
#### finding the most suitable size ####
# here we try sizes 1 to 10 for illustration, but feel free to play around with these values!

sizes = c(1:10)

rate = numeric(length(sizes))  # train-set classification rate
ratep = numeric(length(sizes)) # test-set classification rate
for(d in 1:length(sizes)){
  nno = nnet(Species~., data=dats, subset=itrain,
             size=sizes[d])
  tb = table(max.col(nno$fitted.values), dats$Species[itrain])
  rate[d] = sum(diag(tb))/sum(tb)
  # now looking at test set predictions
  nnop = predict(nno, dats[-itrain,])
  tbp = table(max.col(nnop), dats$Species[-itrain])
  ratep[d] = sum(diag(tbp))/sum(tbp)
}

plot(rate, pch=20, t='b', xlab="layer size", ylim=range(c(rate,ratep)))

points(ratep, pch=15, t='b', col=2)

legend('bottomright', legend=c('training','testing'),

       pch=c(20,15), col=c(1,2), bty='n')

# note that training-set and test-set performance are not necessarily similar...
# this is how to find the best (most suitable) size
# how does decay (the weight-decay parameter, default 0) affect accuracy in nnet?
decays = seq(1, .0001, length.out=11)

rate = numeric(length(decays)) # train-set classification rate

ratep = numeric(length(decays)) # test-set classification rate

for(d in 1:length(decays)){
  # fit NN with that particular decay value (decays[d]):
  nno = nnet(Species~., data=dats, subset=itrain, size=10,
             decay=decays[d])
  # corresponding train set confusion matrix:
  tb = table(max.col(nno$fitted.values), dats$Species[itrain])
  rate[d] = sum(diag(tb))/sum(tb)
  # now looking at test set predictions:
  nnop = predict(nno, dats[-itrain,])
  tbp = table(max.col(nnop), dats$Species[-itrain])
  ratep[d] = sum(diag(tbp))/sum(tbp)
}

plot(decays, rate, pch=20, t='b', ylim=range(c(rate,ratep)))

points(decays, ratep, pch=15, t='b', col=2)

legend('topright', legend=c('training','testing'),

       pch=c(20,15), col=c(1,2), bty='n')

rm(list=ls())

#### Exercises ####
#### Exercise 1 ####

# 1. What type of neural network does this code implement? FFNN

# Two kinds of unit response can be used in a neural network: a step function (output 1 above some
# threshold, 0 below it, all or nothing) or a smooth activation function (which outputs an actual number).
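# A small illustration of the difference (illustrative only): a step function gives an
# all-or-nothing output, while a logistic activation gives a smooth value in (0,1):
z = seq(-4, 4, by=0.1)
step.out     = ifelse(z > 0, 1, 0) # step function
logistic.out = 1 / (1 + exp(-z))   # logistic (sigmoid) activation
plot(z, logistic.out, t='l', ylab="unit output")
lines(z, step.out, lty=2)
legend("topleft", legend=c("logistic","step"), lty=c(1,2), bty='n')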

# 2.

library(MASS)

library(neuralnet)

# --- NN with one 10-node hidden layer

nms = names(Boston)[-14]

f = as.formula(paste("medv ~", paste(nms, collapse = " + ")))

# fit a single-layer, 10-neuron NN:

set.seed(4061)

out.nn = neuralnet(f, data=Boston, hidden=c(10), rep=5,
                   linear.output=FALSE)

#plot(out.nn, information=TRUE, col.entry='red', col.out='green',show.weights=TRUE)

# create a single-hidden-layer neural network, repeating the training 5 times
# now without an activation function on the output (linear output):

set.seed(4061)

out.nn.lin = neuralnet(f, data=Boston, hidden=c(10), rep=1,
                       linear.output=TRUE)

# Warning message:
# Algorithm did not converge in 1 of 1 repetition(s) within the stepmax.
# (so this code may need to be run more than once)
# now with the tanh activation function:

set.seed(4061)

out.nn.tanh = neuralnet(f, data=Boston, hidden=c(10), rep=5,
                        linear.output=FALSE, act.fct='tanh')

p1 = predict(out.nn, newdata=Boston)

p2 = predict(out.nn.tanh, newdata=Boston)

sqrt(mean((p1-Boston$medv)^2))

sqrt(mean((p2-Boston$medv)^2))

# arguments:
# linear.output: logical; TRUE for a linear output, FALSE for a nonlinear (activated) output
# rep: number of repetitions of the neural network training
# act.fct: a differentiable function used to smooth the result of the cross-product of the covariates or neurons with the weights
# act.fct: defaults to 'logistic', can also be 'tanh'

#### Exercise 2 ####

library(neuralnet)

set.seed(4061)

n = nrow(iris)

dat = iris[sample(1:n), ] # shuffle initial dataset

NC = ncol(dat)

nno = neuralnet(Species~., data=dat, hidden=c(6,5))

plot(nno)

#### Exercise 3 #####

#Load dataset MASS::Boston and perform a 70%-30% split for training and test sets

#respectively. Use set.seed(4061) when splitting the data and also every time you run a

#neural network.

#1. Compare single-layer neural network fits from the neuralnet and nnet libraries.

#Can you explain any difference you may find?

# 2. Change the "threshold" argument value to 0.001 in the call to neuralnet, and

#comment on your findings (this run might take a bit more time to converge)

# (Biological analogy: a neuron only fires, and affects the next neuron, once the stimulus exceeds a certain
# threshold. NB: in neuralnet, 'threshold' is a stopping criterion on the partial derivatives of the error function.)

rm(list=ls())

library(neuralnet)

library(nnet)    # implements single-layer NNs
library(MASS)    # includes the Boston dataset

data(Boston) # load the dataset

# train neural nets

n = nrow(Boston)

itrain = sample(1:n, round(.7*n), replace=FALSE)

dat = Boston

dat$medv = dat$medv/50 # normalise the response

dat.train = dat[itrain,]

dat.test = dat[-itrain,-14] # test-set predictors
y.test = dat[-itrain,14]    # test-set response
# nnet: a single hidden layer with five neurons

nno1 = nnet(medv~., data=dat.train, size=5, linout=TRUE)

fit1 = nno1$fitted.values

mean((fit1-dat.train$medv)^2) # training MSE
# neuralnet: a single hidden layer with five neurons

nno2 = neuralnet(medv~., data=dat.train, hidden=c(5), linear.output=TRUE)

fit2 = predict(nno2, newdata=dat.train)[,1]

mean((fit2-dat.train$medv)^2) # training MSE
## neuralnet with threshold = 0.0001

nms = names(dat)[-14]

f = as.formula(paste("medv ~", paste(nms, collapse = " + "))) # the formula used below is long, so build it here first
nno3 = neuralnet(f, data=dat.train, hidden=5, threshold=0.0001) # threshold is the stopping criterion
# (f simply spells out medv ~ all predictors; the default threshold in 'neuralnet' is 0.01, here we set it to 0.0001)

fit3 = predict(nno3, newdata=dat.train)[,1]

mean((fit3-dat.train$medv)^2) # 0.005276877, even better!
# use this training MSE to judge whether the model is usable; it should not be too large
# test the neural nets: predictions must be multiplied back by 50

y.test = y.test*50

p1 = predict(nno1, newdata=dat.test)*50

p2 = predict(nno2, newdata=dat.test)*50

p3 = predict(nno3, newdata=dat.test)*50

mean((p1-y.test)^2)

mean((p2-y.test)^2)

mean((p3-y.test)^2)

# the test-set MSEs tell us which model is better

# explain these differences?

names(nno1)#nnet

names(nno2)#neuralnet

# nnet:

# - activation function: logistic

# - algorithm: BFGS in optim

# - decay: 0

# - learning rate: NA

# - maxit: 100

# neuralnet:

# - activation function: logistic

# - algorithm: (some form of) backpropagation

# - decay: ?

# - learning rate: depending on algorithm

# - maxit:?

# so what is it?

# activation functions in nnet:
# not every signal should produce a response; the activation function determines how a unit reacts to its inputs
# both the hidden layer and the output layer have an activation function
# (in neuralnet the hidden activation is set via act.fct, 'tanh' or 'logistic'; nnet's hidden units use the logistic function)
# the output-layer activation in nnet is controlled by linout
# for regression one typically sets linout=TRUE, so the output activation is the identity (the output is passed through unchanged)
# with linout=FALSE the output activation is logistic, so outputs are squashed to (0,1)
# by default both hidden and output activations are logistic

#### Exercise 4 ####

# Fit a single-layer feed-forward neural network using nnet (to the Hitters data below).
# Report on the fitted values.

rm(list=ls())

library(caret)

library(neuralnet)

library(nnet)

library(ISLR)

# set up the data (take a subset of the Hitters dataset)
dat = na.omit(Hitters) # drop rows with missing values (this dataset has some NAs)

#is.na(Hitters)

n = nrow(dat)

NC = ncol(dat)

# Normalize the response variable to [0,1]:

dats = dat

dats$Salary = (dat$Salary-min(dat$Salary)) / diff(range(dat$Salary))

# train neural net

itrain = sample(1:n, round(.7*n), replace=FALSE)

dat.train = dat[itrain,]

dats.train = dats[itrain,]

dat.test = dat[-itrain,]

dats.test = dats[-itrain,]

# with Salary not scaled: does not work
# (dat is the data before scaling, dats after scaling)

nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1))

summary(nno$fitted.values)

# with Salary scaled but no regularization: does not work either

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0))

summary(nno.s$fitted.values)

# with Salary scaled and with regularization (decay): works!

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1))

summary(nno.s$fitted.values)

#Our last attempt above was a success.

#But we should be able to get a proper fit even for decay=0...

#what's going on? Can you get it to work?

# Fix: add the argument linout=1

#(A1) Well, it's one of these small details in how you call a function;

#here we have to specify 'linout=1' because we are considering a regression problem

# (for a regression problem the output unit should be linear)
# Salary not scaled:

set.seed(4061)

nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1), linout=1)

summary(nno$fitted.values)

#data Salary which is scaled, but with no decay:

set.seed(4061)

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0), linout=1)

summary(nno.s$fitted.values)

#data Salary which is scaled, and also with decay:

set.seed(4061)

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1), linout=1)

summary(nno.s$fitted.values)

# Improvement: write helper functions
# (A2) but let's do the whole thing again more cleanly...
# re-encode and scale the dataset properly

myrecode <- function(x){
  # function recoding factor levels into numerical values
  if(is.factor(x)){
    return(as.numeric(x))
  } else {
    return(x)
  }
}

myscale <- function(x){
  # function applying normalization to the [0,1] scale
  minx = min(x,na.rm=TRUE)
  maxx = max(x,na.rm=TRUE)
  return((x-minx)/(maxx-minx))
}

datss = data.frame(lapply(dat,myrecode))

datss = data.frame(lapply(datss,myscale))

# replicate the same train-test split:

datss.train = datss[itrain,]

datss.test = datss[-itrain,]

nno.ss.check = nnet(Salary~., data=datss.train, size=10, decay=0, linout=1)

summary(nno.ss.check$fitted.values)

# use the same scaled data but with decay as before:

nno.ss = nnet(Salary~., data=datss.train, size=10, decay=c(0.1), linout=1)

summary(nno.ss$fitted.values)

# evaluate on test data (with same decay for both models):
datss.test$Salary - dats.test$Salary # check that the two scalings agree

pred.s = predict(nno.s, newdata=dats.test)

pred.ss = predict(nno.ss, newdata=datss.test)

mean((dats.test$Salary-pred.s)^2)

mean((datss.test$Salary-pred.ss)^2)

# Feed-forward neural network (FFNN):
# - single or multiple layers
# - forward propagation only (see the sketch after this list)
# - the number of layers determines function complexity
# - typically uses a nonlinear activation function
# - some definitions specify a unique hidden layer, others allow any number of layers
# Other architectures:
# Multilayer Perceptron (MLP)
# Recurrent neural network (RNN)
# Long short-term memory neural network (LSTM NN)
# Convolutional neural network (CNN)
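# A minimal sketch of forward propagation through one hidden layer (illustrative only,
# with made-up random weights; actual fitting is done by nnet/neuralnet as above):
logistic <- function(z) 1/(1+exp(-z))
x.in = c(0.2, 0.7)                     # one observation with 2 inputs
W1 = matrix(runif(2*3, -1, 1), nrow=2) # input-to-hidden weights (2 x 3)
b1 = runif(3, -1, 1)                   # hidden-layer biases
W2 = matrix(runif(3*1, -1, 1), nrow=3) # hidden-to-output weights (3 x 1)
b2 = runif(1, -1, 1)                   # output bias
h = logistic(x.in %*% W1 + b1)         # hidden-layer activations
y.out = h %*% W2 + b2                  # linear output (regression-style)
y.out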

#### Exercise 6: Olden index ####
# Use NeuralNetTools::olden() to compute variable importance for the following datasets,
# in each case fitting a 7-neuron single-layer FFNN (nnet):
# 1. the iris dataset (use the full set);
# 2. the Boston dataset.
# Compare with the variable importance assessment obtained from a random forest.

rm(list=ls())

library(nnet)

library(NeuralNetTools)

library(randomForest)

library(MASS)

myscale <- function(x){
  minx = min(x,na.rm=TRUE)
  maxx = max(x,na.rm=TRUE)
  return((x-minx)/(maxx-minx))
}

# (1) Iris data

# shuffle dataset...

set.seed(4061)

n = nrow(iris)

dat = iris[sample(1:n),]

# rescale predictors...

dat[,1:4] = myscale(dat[,1:4])

# fit Feed-Forward Neural Network...

set.seed(4061)

nno = nnet(Species~., data=dat, size=c(7), linout=FALSE, entropy=TRUE)

pis = nno$fitted.values

matplot(pis, col=c(1,2,4), pch=20)

y.hat = apply(pis, 1, which.max) # fitted values

table(y.hat, dat$Species)

# compute variable importance...

# Olden's index: the relative importance of an input variable in a neural network is the sum of the
# products of the raw input-hidden and hidden-output connection weights

vimp.setosa = olden(nno, out_var='setosa', bar_plot=FALSE)

vimp.virginica = olden(nno, out_var='virginica', bar_plot=FALSE)

vimp.versicolor = olden(nno, out_var='versicolor', bar_plot=FALSE)

names(vimp.setosa)

par(mfrow=c(1,2))

plot(iris[,3:4], pch=20, col=c(1,2,4)[iris$Species], cex=2)

plot(iris[,c(1,3)], pch=20, col=c(1,2,4)[iris$Species], cex=2)

dev.new()

plot(olden(nno, out_var='setosa'))

plot(olden(nno, out_var='virginica'))

plot(olden(nno, out_var='versicolor'))

v.imp = cbind(vimp.setosa$importance, vimp.virginica$importance, vimp.versicolor$importance)

rownames(v.imp) = names(dat)[1:4]

colnames(v.imp) = levels(dat$Species)

(v.imp)

# the sign indicates a positive or negative effect on the response
# the larger the absolute value, the stronger the influence of that predictor on the response
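# A minimal sketch of Olden's connection-weights computation itself (illustrative only,
# with made-up weights; olden() above does this from the fitted nnet object):
W.in.hid  = matrix(c( 0.5, -1.2,  0.3,
                     -0.8,  0.6,  1.1), nrow=2, byrow=TRUE) # 2 inputs x 3 hidden units
W.hid.out = c(0.9, -0.4, 0.7)                               # 3 hidden units x 1 output
olden.by.hand = W.in.hid %*% W.hid.out # sum over hidden units of w(in->hid)*w(hid->out)
olden.by.hand                          # one signed importance value per input variable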

# fit RF...

set.seed(4061)

rfo = randomForest(Species~., data=dat, ntrees=1000)

rfo$importance

# how can we compare variable importance assessments?

cbind(apply(v.imp, 1, sum),      # net importance across classes (positive and negative effects can cancel out)
      apply(abs(v.imp), 1, sum), # importance using absolute values
      rfo$importance)
# predictors with large values in all three columns have an overall contribution

# (2) Boston data

set.seed(4061)

n = nrow(Boston)

dat = Boston[sample(1:n),]

# rescale predictors...

dats = myscale(dat)

dats$medv = dat$medv/50

set.seed(4061)

nno = nnet(medv~., data=dats, size=7, linout=1)

y.hat = nno$fitted.values

plot(y.hat*50, dat$medv) # visual check of accuracy
mean((y.hat*50-dat$medv)^2) # MSE

v.imp = olden(nno, bar_plot=FALSE)

plot(v.imp)

# fit RF... and compare its variable importance with the one obtained from the neural network fit

set.seed(4061)

rfo = randomForest(medv~., data=dat, ntrees=1000)

rfo$importance

# how can we compare variable importance assessments?

cbind(v.imp, rfo$importance)

round(cbind(v.imp/sum(abs(v.imp)),
            rfo$importance/sum(rfo$importance)),3)*100 # importance as percentages
# some variables are large on both sides and stand out
# for other variables the two assessments differ a lot

# should we use absolute values of Olden's index?

par(mfrow=c(2,1))

barplot(abs(v.imp[,1]), main="importance from NN",
        names=rownames(v.imp), las=2)
barplot(rfo$importance[,1], main="importance from RF",
        names=rownames(v.imp), las=2)

# compare graphically

# or possibly normalize across all values for proportional contribution?

par(mfrow=c(2,1))

NNN = sum(abs(v.imp[,1]))

NRF = sum(abs(rfo$importance[,1]))

barplot(abs(v.imp[,1])/NNN, main="importance from NN",
        names=rownames(v.imp), las=2)
barplot(rfo$importance[,1]/NRF, main="importance from RF",
        names=rownames(v.imp), las=2)

# combining the two plots above looks alright... now make it a nicer comparative plot :)

par(font=2, font.axis=2)

imps = rbind(NN=abs(v.imp[,1])/NNN, RF=rfo$importance[,1]/NRF)

cols = c('cyan','pink')

barplot(imps, names=colnames(imps), las=2, beside=TRUE,
        col=cols,
        ylab="relative importance (%)",
        main="Variable importance from NN and RF")

legend("topleft", legend=c('NN','RF'), col=cols, bty='n', pch=15)
