The data set comes from UCI Machine Learning Repository: Online News Popularity Data Set.

The goal of this project is to classify online news into popular or not popular category. The code shows you how to build KNN, CART, C5.0 and Random Forest models.

Load Library

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(miscTools)
library(caret) # Accuracy
## Loading required package: lattice
## Loading required package: ggplot2
library(ROCR) #ROC curve
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(e1071)
library(C50) #C5.0
library(rpart) #R part
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.0.5 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
library(ggplot2)

Load Data

news=read.csv("OnlineNewsPopularity.csv")
news=news[!news$n_unique_tokens==701,]
summary(news)
##                                                              url       
##  http://mashable.com/2013/01/07/amazon-instant-video-browser/  :    1  
##  http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/   :    1  
##  http://mashable.com/2013/01/07/apple-40-billion-app-downloads/:    1  
##  http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/      :    1  
##  http://mashable.com/2013/01/07/att-u-verse-apps/              :    1  
##  http://mashable.com/2013/01/07/beewi-smart-toys/              :    1  
##  (Other)                                                       :39637  
##    timedelta     n_tokens_title n_tokens_content n_unique_tokens 
##  Min.   :  8.0   Min.   : 2.0   Min.   :   0.0   Min.   :0.0000  
##  1st Qu.:164.0   1st Qu.: 9.0   1st Qu.: 246.0   1st Qu.:0.4700  
##  Median :339.0   Median :10.0   Median : 409.0   Median :0.5400  
##  Mean   :354.5   Mean   :10.4   Mean   : 546.5   Mean   :0.5305  
##  3rd Qu.:542.0   3rd Qu.:12.0   3rd Qu.: 716.0   3rd Qu.:0.6100  
##  Max.   :731.0   Max.   :23.0   Max.   :8474.0   Max.   :1.0000  
##                                                                  
##  n_non_stop_words n_non_stop_unique_tokens   num_hrefs     
##  Min.   :0.0000   Min.   :0.0000           Min.   :  0.00  
##  1st Qu.:1.0000   1st Qu.:0.6300           1st Qu.:  4.00  
##  Median :1.0000   Median :0.6900           Median :  8.00  
##  Mean   :0.9702   Mean   :0.6728           Mean   : 10.88  
##  3rd Qu.:1.0000   3rd Qu.:0.7500           3rd Qu.: 14.00  
##  Max.   :1.0000   Max.   :1.0000           Max.   :304.00  
##                                                            
##  num_self_hrefs       num_imgs         num_videos    average_token_length
##  Min.   :  0.000   Min.   :  0.000   Min.   : 0.00   Min.   :0.000       
##  1st Qu.:  1.000   1st Qu.:  1.000   1st Qu.: 0.00   1st Qu.:4.480       
##  Median :  3.000   Median :  1.000   Median : 0.00   Median :4.660       
##  Mean   :  3.293   Mean   :  4.543   Mean   : 1.25   Mean   :4.548       
##  3rd Qu.:  4.000   3rd Qu.:  4.000   3rd Qu.: 1.00   3rd Qu.:4.850       
##  Max.   :116.000   Max.   :128.000   Max.   :91.00   Max.   :8.040       
##                                                                          
##   num_keywords    data_channel_is_lifestyle data_channel_is_entertainment
##  Min.   : 1.000   Min.   :0.00000           Min.   :0.000                
##  1st Qu.: 6.000   1st Qu.:0.00000           1st Qu.:0.000                
##  Median : 7.000   Median :0.00000           Median :0.000                
##  Mean   : 7.224   Mean   :0.05295           Mean   :0.178                
##  3rd Qu.: 9.000   3rd Qu.:0.00000           3rd Qu.:0.000                
##  Max.   :10.000   Max.   :1.00000           Max.   :1.000                
##                                                                          
##  data_channel_is_bus data_channel_is_socmed data_channel_is_tech
##  Min.   :0.0000      Min.   :0.0000         Min.   :0.0000      
##  1st Qu.:0.0000      1st Qu.:0.0000         1st Qu.:0.0000      
##  Median :0.0000      Median :0.0000         Median :0.0000      
##  Mean   :0.1579      Mean   :0.0586         Mean   :0.1853      
##  3rd Qu.:0.0000      3rd Qu.:0.0000         3rd Qu.:0.0000      
##  Max.   :1.0000      Max.   :1.0000         Max.   :1.0000      
##                                                                 
##  data_channel_is_world   kw_min_min       kw_max_min       kw_avg_min     
##  Min.   :0.0000        Min.   : -1.00   Min.   :     0   Min.   :   -1.0  
##  1st Qu.:0.0000        1st Qu.: -1.00   1st Qu.:   445   1st Qu.:  141.8  
##  Median :0.0000        Median : -1.00   Median :   660   Median :  235.5  
##  Mean   :0.2126        Mean   : 26.11   Mean   :  1154   Mean   :  312.4  
##  3rd Qu.:0.0000        3rd Qu.:  4.00   3rd Qu.:  1000   3rd Qu.:  357.0  
##  Max.   :1.0000        Max.   :377.00   Max.   :298400   Max.   :42827.9  
##                                                                           
##    kw_min_max       kw_max_max       kw_avg_max       kw_min_avg  
##  Min.   :     0   Min.   :     0   Min.   :     0   Min.   :  -1  
##  1st Qu.:     0   1st Qu.:843300   1st Qu.:172844   1st Qu.:   0  
##  Median :  1400   Median :843300   Median :244567   Median :1024  
##  Mean   : 13612   Mean   :752322   Mean   :259280   Mean   :1117  
##  3rd Qu.:  7900   3rd Qu.:843300   3rd Qu.:330980   3rd Qu.:2057  
##  Max.   :843300   Max.   :843300   Max.   :843300   Max.   :3613  
##                                                                   
##    kw_max_avg       kw_avg_avg    self_reference_min_shares
##  Min.   :     0   Min.   :    0   Min.   :     0           
##  1st Qu.:  3562   1st Qu.: 2382   1st Qu.:   639           
##  Median :  4356   Median : 2870   Median :  1200           
##  Mean   :  5657   Mean   : 3136   Mean   :  3999           
##  3rd Qu.:  6020   3rd Qu.: 3600   3rd Qu.:  2600           
##  Max.   :298400   Max.   :43568   Max.   :843300           
##                                                            
##  self_reference_max_shares self_reference_avg_sharess weekday_is_monday
##  Min.   :     0            Min.   :     0.0           Min.   :0.000    
##  1st Qu.:  1100            1st Qu.:   981.1           1st Qu.:0.000    
##  Median :  2800            Median :  2200.0           Median :0.000    
##  Mean   : 10330            Mean   :  6401.7           Mean   :0.168    
##  3rd Qu.:  8000            3rd Qu.:  5200.0           3rd Qu.:0.000    
##  Max.   :843300            Max.   :843300.0           Max.   :1.000    
##                                                                        
##  weekday_is_tuesday weekday_is_wednesday weekday_is_thursday
##  Min.   :0.0000     Min.   :0.0000       Min.   :0.0000     
##  1st Qu.:0.0000     1st Qu.:0.0000       1st Qu.:0.0000     
##  Median :0.0000     Median :0.0000       Median :0.0000     
##  Mean   :0.1864     Mean   :0.1875       Mean   :0.1833     
##  3rd Qu.:0.0000     3rd Qu.:0.0000       3rd Qu.:0.0000     
##  Max.   :1.0000     Max.   :1.0000       Max.   :1.0000     
##                                                             
##  weekday_is_friday weekday_is_saturday weekday_is_sunday   is_weekend    
##  Min.   :0.0000    Min.   :0.00000     Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000    1st Qu.:0.00000     1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000    Median :0.00000     Median :0.00000   Median :0.0000  
##  Mean   :0.1438    Mean   :0.06188     Mean   :0.06904   Mean   :0.1309  
##  3rd Qu.:0.0000    3rd Qu.:0.00000     3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000    Max.   :1.00000     Max.   :1.00000   Max.   :1.0000  
##                                                                          
##      LDA_00           LDA_01           LDA_02           LDA_03     
##  Min.   :0.0200   Min.   :0.0200   Min.   :0.0200   Min.   :0.020  
##  1st Qu.:0.0300   1st Qu.:0.0300   1st Qu.:0.0300   1st Qu.:0.030  
##  Median :0.0300   Median :0.0300   Median :0.0400   Median :0.040  
##  Mean   :0.1847   Mean   :0.1413   Mean   :0.2165   Mean   :0.224  
##  3rd Qu.:0.2400   3rd Qu.:0.1500   3rd Qu.:0.3300   3rd Qu.:0.380  
##  Max.   :0.9300   Max.   :0.9300   Max.   :0.9200   Max.   :0.930  
##                                                                    
##      LDA_04       global_subjectivity global_sentiment_polarity
##  Min.   :0.0200   Min.   :0.0000      Min.   :-0.3900          
##  1st Qu.:0.0300   1st Qu.:0.4000      1st Qu.: 0.0600          
##  Median :0.0400   Median :0.4500      Median : 0.1200          
##  Mean   :0.2341   Mean   :0.4434      Mean   : 0.1193          
##  3rd Qu.:0.4000   3rd Qu.:0.5100      3rd Qu.: 0.1800          
##  Max.   :0.9300   Max.   :1.0000      Max.   : 0.7300          
##                                                                
##  global_rate_positive_words global_rate_negative_words rate_positive_words
##  Min.   :0.00000            Min.   :0.00000            Min.   :0.0000     
##  1st Qu.:0.03000            1st Qu.:0.01000            1st Qu.:0.6000     
##  Median :0.04000            Median :0.02000            Median :0.7100     
##  Mean   :0.03965            Mean   :0.01664            Mean   :0.6825     
##  3rd Qu.:0.05000            3rd Qu.:0.02000            3rd Qu.:0.8000     
##  Max.   :0.16000            Max.   :0.18000            Max.   :1.0000     
##                                                                           
##  rate_negative_words avg_positive_polarity min_positive_polarity
##  Min.   :0.000       Min.   :0.0000        Min.   :0.00000      
##  1st Qu.:0.190       1st Qu.:0.3100        1st Qu.:0.05000      
##  Median :0.280       Median :0.3600        Median :0.10000      
##  Mean   :0.288       Mean   :0.3539        Mean   :0.09506      
##  3rd Qu.:0.380       3rd Qu.:0.4100        3rd Qu.:0.10000      
##  Max.   :1.000       Max.   :1.0000        Max.   :1.00000      
##                                                                 
##  max_positive_polarity avg_negative_polarity min_negative_polarity
##  Min.   :0.0000        Min.   :-1.0000       Min.   :-1.0000      
##  1st Qu.:0.6000        1st Qu.:-0.3300       1st Qu.:-0.7000      
##  Median :0.8000        Median :-0.2500       Median :-0.5000      
##  Mean   :0.7568        Mean   :-0.2598       Mean   :-0.5222      
##  3rd Qu.:1.0000        3rd Qu.:-0.1900       3rd Qu.:-0.3000      
##  Max.   :1.0000        Max.   : 0.0000       Max.   : 0.0000      
##                                                                   
##  max_negative_polarity title_subjectivity title_sentiment_polarity
##  Min.   :-1.0000       Min.   :0.0000     Min.   :-1.00000        
##  1st Qu.:-0.1300       1st Qu.:0.0000     1st Qu.: 0.00000        
##  Median :-0.1000       Median :0.1500     Median : 0.00000        
##  Mean   :-0.1085       Mean   :0.2824     Mean   : 0.07154        
##  3rd Qu.:-0.0500       3rd Qu.:0.5000     3rd Qu.: 0.15000        
##  Max.   : 0.0000       Max.   :1.0000     Max.   : 1.00000        
##                                                                   
##  abs_title_subjectivity abs_title_sentiment_polarity     shares      
##  Min.   :0.0000         Min.   :0.0000               Min.   :     1  
##  1st Qu.:0.1700         1st Qu.:0.0000               1st Qu.:   946  
##  Median :0.5000         Median :0.0000               Median :  1400  
##  Mean   :0.3421         Mean   :0.1564               Mean   :  3395  
##  3rd Qu.:0.5000         3rd Qu.:0.2500               3rd Qu.:  2800  
##  Max.   :0.5000         Max.   :1.0000               Max.   :843300  
## 

Delete url and timedelta columns

newsreg <- subset( news, select = -c(url, timedelta ) )

Standardize data

# generate z-scores using the scale() function
for(i in ncol(newsreg)-1){ 
  newsreg[,i]<-scale(newsreg[,i], center = TRUE, scale = TRUE)
}

Define articles with shares larger than 1400 (median) as popular article

# Dataset for classification
newscla <-newsreg
newscla$shares <- as.factor(ifelse(newscla$shares > 1400,1,0))

Split Data

Train - 70%; Test - 30%

#set random situation
set.seed(100)
# Select traning data and prediction data
ind<-sample(2,nrow(newscla),replace=TRUE,prob=c(0.7,0.3))

Color Palatte

color.lr<-'#efab69'
color.knn<-'#ef696a'
color.cart<-'#ab69ef'
color.c50<-'#69adef'
color.rf<-'#adef69'

KNN

Note that the type="class","prob" parameter is used to create different format of prediction data to fit into further analysis.

newscla.knn <- knn3(shares ~.,newscla[ind==1,])
newscla.knn.pred <- predict( newscla.knn,newscla[ind==2,],type="class")
newscla.knn.prob <- predict( newscla.knn,newscla[ind==2,],type="prob")

# Confusion matrix
confusionMatrix(newscla.knn.pred, newscla[ind==2,]$shares)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3553 2672
##          1 2458 3067
##                                           
##                Accuracy : 0.5634          
##                  95% CI : (0.5544, 0.5724)
##     No Information Rate : 0.5116          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.1256          
##  Mcnemar's Test P-Value : 0.002941        
##                                           
##             Sensitivity : 0.5911          
##             Specificity : 0.5344          
##          Pos Pred Value : 0.5708          
##          Neg Pred Value : 0.5551          
##              Prevalence : 0.5116          
##          Detection Rate : 0.3024          
##    Detection Prevalence : 0.5298          
##       Balanced Accuracy : 0.5627          
##                                           
##        'Positive' Class : 0               
## 
# ROC Curve
newscla.knn.roc <- roc(newscla[ind==2,]$shares,newscla.knn.prob[,2])
plot(newscla.knn.roc, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col=color.knn, print.thres=TRUE)

## 
## Call:
## roc.default(response = newscla[ind == 2, ]$shares, predictor = newscla.knn.prob[,     2])
## 
## Data: newscla.knn.prob[, 2] in 6011 controls (newscla[ind == 2, ]$shares 0) < 5739 cases (newscla[ind == 2, ]$shares 1).
## Area under the curve: 0.5887

CART (Classification and Regression Trees)

newscla.cart<-rpart(shares ~.,newscla[ind==1,],method='class')

# Plot tree
####plot(newscla.cart)
####text(newscla.cart)
####prp(newscla.cart)
fancyRpartPlot(newscla.cart) # the most beautiful one

"Summary of CART"
## [1] "Summary of CART"
summary(newscla.cart)
## Call:
## rpart(formula = shares ~ ., data = newscla[ind == 1, ], method = "class")
##   n= 27893 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.18687599      0 1.0000000 1.0000000 0.006041286
## 2 0.02749240      1 0.8131240 0.8144263 0.005928124
## 3 0.01490378      2 0.7856316 0.7870062 0.005893483
## 4 0.01454203      3 0.7707278 0.7807843 0.005884956
## 5 0.01000000      4 0.7561858 0.7569093 0.005849921
## 
## Variable importance
##                    kw_avg_avg                    kw_max_avg 
##                            23                            15 
##                    kw_min_avg                    kw_min_max 
##                            10                             9 
##                        LDA_03 data_channel_is_entertainment 
##                             9                             8 
##                    kw_avg_max          data_channel_is_tech 
##                             8                             6 
##        data_channel_is_socmed                        LDA_04 
##                             5                             4 
##                        LDA_01 
##                             2 
## 
## Node number 1: 27893 observations,    complexity param=0.186876
##   predicted class=0  expected loss=0.4955365  P(node) =1
##     class counts: 14071 13822
##    probabilities: 0.504 0.496 
##   left son=2 (14200 obs) right son=3 (13693 obs)
##   Primary splits:
##       kw_avg_avg                 < 2887.855 to the left,  improve=524.9149, (0 missing)
##       kw_max_avg                 < 3749.93  to the left,  improve=447.7867, (0 missing)
##       self_reference_min_shares  < 1650     to the left,  improve=362.2783, (0 missing)
##       self_reference_avg_sharess < 2796.91  to the left,  improve=356.8588, (0 missing)
##       LDA_02                     < 0.595    to the right, improve=322.3864, (0 missing)
##   Surrogate splits:
##       kw_max_avg < 4333.405 to the left,  agree=0.823, adj=0.639, (0 split)
##       kw_min_avg < 1685.56  to the left,  agree=0.721, adj=0.432, (0 split)
##       kw_min_max < 2950     to the left,  agree=0.693, adj=0.374, (0 split)
##       LDA_03     < 0.06     to the left,  agree=0.689, adj=0.367, (0 split)
##       kw_avg_max < 283366.9 to the left,  agree=0.676, adj=0.341, (0 split)
## 
## Node number 2: 14200 observations,    complexity param=0.01490378
##   predicted class=0  expected loss=0.4002817  P(node) =0.5090883
##     class counts:  8516  5684
##    probabilities: 0.600 0.400 
##   left son=4 (10962 obs) right son=5 (3238 obs)
##   Primary splits:
##       data_channel_is_tech       < 0.5      to the left,  improve=145.1250, (0 missing)
##       is_weekend                 < 0.5      to the left,  improve=126.1450, (0 missing)
##       self_reference_avg_sharess < 1889.665 to the left,  improve=125.7891, (0 missing)
##       LDA_04                     < 0.315    to the left,  improve=120.2984, (0 missing)
##       data_channel_is_world      < 0.5      to the right, improve=116.4967, (0 missing)
##   Surrogate splits:
##       LDA_04                     < 0.515    to the left,  agree=0.896, adj=0.545, (0 split)
##       num_self_hrefs             < 13.5     to the left,  agree=0.775, adj=0.014, (0 split)
##       self_reference_max_shares  < 458150   to the left,  agree=0.773, adj=0.002, (0 split)
##       self_reference_avg_sharess < 1e+05    to the left,  agree=0.773, adj=0.002, (0 split)
##       kw_avg_max                 < 561785   to the left,  agree=0.772, adj=0.001, (0 split)
## 
## Node number 3: 13693 observations,    complexity param=0.0274924
##   predicted class=1  expected loss=0.4056817  P(node) =0.4909117
##     class counts:  5555  8138
##    probabilities: 0.406 0.594 
##   left son=6 (2726 obs) right son=7 (10967 obs)
##   Primary splits:
##       data_channel_is_entertainment < 0.5      to the right, improve=183.12460, (0 missing)
##       self_reference_min_shares     < 1550     to the left,  improve=123.63820, (0 missing)
##       self_reference_avg_sharess    < 2887.385 to the left,  improve= 98.91209, (0 missing)
##       is_weekend                    < 0.5      to the left,  improve= 94.58317, (0 missing)
##       self_reference_max_shares     < 2950     to the left,  improve= 73.29898, (0 missing)
##   Surrogate splits:
##       LDA_01     < 0.485    to the right, agree=0.856, adj=0.276, (0 split)
##       num_videos < 21.5     to the right, agree=0.805, adj=0.018, (0 split)
##       num_imgs   < 48.5     to the right, agree=0.803, adj=0.011, (0 split)
##       num_hrefs  < 178.5    to the right, agree=0.801, adj=0.001, (0 split)
## 
## Node number 4: 10962 observations,    complexity param=0.01454203
##   predicted class=0  expected loss=0.3614304  P(node) =0.3930018
##     class counts:  7000  3962
##    probabilities: 0.639 0.361 
##   left son=8 (10399 obs) right son=9 (563 obs)
##   Primary splits:
##       data_channel_is_socmed    < 0.5      to the left,  improve=119.33500, (0 missing)
##       kw_avg_max                < 143976.2 to the right, improve=101.73670, (0 missing)
##       kw_max_avg                < 3645.125 to the left,  improve= 94.70809, (0 missing)
##       kw_max_max                < 654150   to the right, improve= 89.97251, (0 missing)
##       self_reference_min_shares < 1650     to the left,  improve= 85.74340, (0 missing)
##   Surrogate splits:
##       num_keywords   < 2.5      to the right, agree=0.949, adj=0.011, (0 split)
##       num_self_hrefs < 38.5     to the left,  agree=0.949, adj=0.009, (0 split)
## 
## Node number 5: 3238 observations
##   predicted class=1  expected loss=0.4681902  P(node) =0.1160865
##     class counts:  1516  1722
##    probabilities: 0.468 0.532 
## 
## Node number 6: 2726 observations
##   predicted class=0  expected loss=0.4303008  P(node) =0.09773061
##     class counts:  1553  1173
##    probabilities: 0.570 0.430 
## 
## Node number 7: 10967 observations
##   predicted class=1  expected loss=0.3649129  P(node) =0.3931811
##     class counts:  4002  6965
##    probabilities: 0.365 0.635 
## 
## Node number 8: 10399 observations
##   predicted class=0  expected loss=0.3442639  P(node) =0.3728176
##     class counts:  6819  3580
##    probabilities: 0.656 0.344 
## 
## Node number 9: 563 observations
##   predicted class=1  expected loss=0.321492  P(node) =0.02018428
##     class counts:   181   382
##    probabilities: 0.321 0.679
#predict
newscla.cart.pred<-predict( newscla.cart,newscla[ind==2,] ,type="class")
newscla.cart.prob<-predict( newscla.cart,newscla[ind==2,] ,type="prob")
# Confusion matrix
confusionMatrix(newscla.cart.pred, newscla[ind==2,]$shares)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3480 1943
##          1 2531 3796
##                                          
##                Accuracy : 0.6192         
##                  95% CI : (0.6104, 0.628)
##     No Information Rate : 0.5116         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.2398         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.5789         
##             Specificity : 0.6614         
##          Pos Pred Value : 0.6417         
##          Neg Pred Value : 0.6000         
##              Prevalence : 0.5116         
##          Detection Rate : 0.2962         
##    Detection Prevalence : 0.4615         
##       Balanced Accuracy : 0.6202         
##                                          
##        'Positive' Class : 0              
## 
# ROC Curve
newscla.cart.roc <- roc(newscla[ind==2,]$shares,newscla.cart.prob[,2])
plot(newscla.cart.roc, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col=color.cart, print.thres=TRUE)