探索多個變量

# Attach ggplot2 up front: every plot below calls ggplot(), but the script
# never attached the package before its first use.
library(ggplot2)

# Show the working directory and its files to confirm the data file is there.
getwd()
list.files()

# Load the tab-separated pseudo-Facebook data set into `pf`.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")

Third Qualitative Variable

在以性別為分類的年齡箱線圖中,加入每個性別的平均年齡
原箱線圖:

# Box plot of age for each gender; rows with a missing gender are left out.
ggplot(data = pf[!is.na(pf$gender), ],
       mapping = aes(x = gender, y = age)) +
  geom_boxplot()

添加后的箱線圖:

# Same box plot, with each gender's mean age overlaid as a point (shape 4).
# `fun` replaces `fun.y`, which ggplot2 deprecated in version 3.3.0.
ggplot(aes(x = gender, y = age),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  stat_summary(fun = mean, geom = "point", shape = 4)

年齡、朋友數、性別三個變量:

# Median friend count at each age, one line per gender (missing gender dropped).
# The summary function is passed as `fun`: since ggplot2 3.3.0 `fun.y` is
# deprecated, and when forwarded through geom_line() it is not honoured, so
# the plot would fall back to mean_se() instead of the median.
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_line(aes(color = gender), stat = "summary", fun = median)

按年齡和性別對數據進行分組,并計算每個組里的平均好友數,中位數好友數和每個組的數據條目數

# Detach plyr before loading dplyr (presumably to avoid plyr masking dplyr
# verbs such as summarise()). Only detach when plyr is actually attached:
# detach() raises an error for a package that is not on the search path.
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}
library(dplyr)

# For every (age, gender) group with a known gender, compute the mean friend
# count, the median friend count and the number of rows; sort by age.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(as.numeric(friend_count)),
            n = n()) %>%
  ungroup() %>%
  arrange(age)

head(pf.fc_by_age_gender)

用上面的分組創建圖表

# Line chart of the per-group median friend count against age,
# one coloured line per gender.
ggplot(data = pf.fc_by_age_gender,
       mapping = aes(x = age, y = friend_count_median)) +
  geom_line(mapping = aes(colour = gender))

Thinking in Ratios

女性用戶的好友數是男性用戶好友數的幾倍?
要回答這個問題,先重塑我們的數據
pf.fc_by_age_gender是長格式數據,我們要把它轉化成寬格式數據,
每一行包括:
年齡
對應該年齡的男性用戶的好友數(中位數)
對應該年齡的女性用戶的好友數(中位數)

library(reshape2)

# Reshape long -> wide: one row per age, one column per gender value,
# with the median friend count in the cells.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "friend_count_median"
)
head(pf.fc_by_age_gender.wide)

函數dcast()中的d表示輸出的數據結構為dataframe
如果要輸出矩陣或者數組,應使用acast()


Ratio Plot

橫軸:年齡
縱軸:女/男好友數的中位數之比

# Ratio of the female to the male median friend count at each age.
# The faint dashed horizontal line at 1 marks where the two medians are equal.
ggplot(data = pf.fc_by_age_gender.wide,
       mapping = aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = "dashed")

探索四個變量:年齡,性別,好友數,使用時長tenure

以2014為基準年,添加[加入時間]這個變量

# Joining year relative to 2014: tenure is divided by 365 (i.e. treated as a
# number of days), subtracted from 2014, and rounded down.
pf$year_joined <- with(pf, floor(2014 - tenure / 365))

floor為向下取整,返回不大于該數字的最大整數

Cut a Variable

切割變量year_joined,分為以下幾組:
(2004,2009],(2009,2011],(2011,2012],(2012,2014]

# Look at the distribution of joining years before binning.
summary(pf[["year_joined"]])
table(pf[["year_joined"]])

# Bin the joining year at the break points 2004, 2009, 2011, 2012 and 2014,
# then count the rows per bucket (NA is listed too when present).
pf$year_joined.buckets <- cut(
  pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)
table(pf[["year_joined.buckets"]], useNA = "ifany")

use variable year_joined.buckets to create a line graph

# Median friend count at each age, one line per joining-year bucket
# (rows without a bucket are dropped).
# The summary is passed as `fun`: since ggplot2 3.3.0 `fun.y` is deprecated
# and, when forwarded through geom_line(), is not honoured, so the plot would
# fall back to mean_se() instead of the median.
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.buckets))) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = median)

the parameter linetype can take the values 0-6:
0 = blank,
1 = solid,
2 = dashed
3 = dotted
4 = dotdash
5 = longdash
6 = twodash


Plot the Grand Mean

# Mean friend count at each age per joining-year bucket, plus the overall
# (grand) mean across all buckets drawn as a dashed line (linetype 2).
# `fun` replaces `fun.y`, which ggplot2 deprecated in 3.3.0 and does not
# honour when it is forwarded through geom_line().
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.buckets))) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)

Friending Rate

# Summary statistics of the friending rate (friend_count / tenure).
# Rows with tenure >= 1 are kept, matching the filter used by the
# friendships-initiated plots in this file; tenure 0 is excluded so the
# division never has a zero denominator.
with(subset(pf, tenure >= 1), summary(friend_count / tenure))

Friendships Initiated

# Mean friendships initiated per unit of tenure, plotted against tenure, one
# line per joining-year bucket. Only rows with tenure >= 1 are used.
# `fun` replaces `fun.y`, which ggplot2 deprecated in 3.3.0 and does not
# honour when it is forwarded through geom_line().
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.buckets), stat = "summary", fun = mean)

偏差-方差權衡


# Four versions of the same plot (mean friendships initiated per unit of
# tenure, by joining-year bucket) with progressively coarser x-axis binning.
# Rounding tenure to multiples of 7, 30 and 90 pools more rows into each
# plotted mean, giving smoother lines at the cost of detail.
# All four use `fun` instead of `fun.y`: ggplot2 deprecated `fun.y` in 3.3.0
# and does not honour it when it is forwarded through geom_line().

# No binning: one mean per distinct tenure value.
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = mean)

# Tenure rounded to the nearest multiple of 7.
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = mean)

# Tenure rounded to the nearest multiple of 30.
ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = mean)

# Tenure rounded to the nearest multiple of 90.
ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = mean)
# Instead of binning by hand, let geom_smooth() fit a smoothed curve per
# joining-year bucket to friendships initiated per unit of tenure.
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_smooth(mapping = aes(colour = year_joined.buckets))


the Yogurt Data Set

# Load the yogurt data set and open it in the data viewer.
getwd()
yo <- read.csv(file = "yogurt.csv")
View(yo)

# Convert the id column to a factor, then inspect the resulting structure.
yo[["id"]] <- factor(yo[["id"]])
str(yo)

酸奶價格直方圖

# Histogram of yogurt prices with the default binning and a fixed fill colour.
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(fill = I("#FF6374"))

不同的酸奶價格

# The distinct price values, how many there are, and how often each occurs.
unique(yo[["price"]])
length(unique(yo[["price"]]))
table(yo[["price"]])

將一條購買記錄中不同口味的酸奶數量加總,匯總成新變量all_purchases

names(yo)

# Add all_purchases: the per-row sum of the five flavour count columns.
yo$all_purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)

all_purchases histogram

# Histogram of all_purchases with one bin per unit.
ggplot(data = yo, mapping = aes(x = all_purchases)) +
  geom_histogram(binwidth = 1)

隨時間變化的價格

# Price against time as jittered, mostly transparent points (alpha 1/10),
# drawn with shape 21 so the fill colour applies.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(alpha = 1 / 10, shape = 21, fill = I("#F79420"))
  

Sampling Observations

對于酸奶數據集,我們可能需要更詳細地調查小樣本的家庭


Looking at Samples of Households

# Fix the RNG seed so the same 16 id levels are drawn on every run.
set.seed(4230)
sample.ids <- sample(x = levels(yo$id), size = 16)
sample.ids

# One panel per sampled id: price over time as a line, with open-circle
# points whose size reflects all_purchases for that row.
ggplot(data = yo[yo$id %in% sample.ids, ],
       mapping = aes(x = time, y = price)) +
  geom_line() +
  geom_point(mapping = aes(size = all_purchases), shape = 1) +
  facet_wrap(~ id)
    

Scatterplot Matrix 散點圖矩陣

library(GGally)
theme_set(theme_minimal(base_size = 20))

# Seed the RNG so the 1000-row sample below is reproducible.
set.seed(1836)

# Keep columns 2 through 15 of pf, then draw a scatterplot matrix
# from 1000 randomly chosen rows.
pf_subset <- pf[, 2:15]
names(pf_subset)
ggpairs(pf_subset[sample.int(n = nrow(pf_subset), size = 1000), ])

set.seed確保得到可重復的結果


Even More Variables

# Load the nci data set and show its original column names.
nci <- read.table("nci.tsv")
colnames(nci)

# Rename the columns to 1..64 (assumes nci has exactly 64 columns - confirm).
# Fixed: the original wrote `colnames <- c(1:64)`, which created a new
# variable named `colnames` and left the column names of nci untouched.
colnames(nci) <- c(1:64)

Heat Maps

library(reshape2)

# Melt the first 200 rows of nci (as a matrix) into long format with one row
# per cell, and name the three resulting columns gene, case and value.
nci.long.samp <- setNames(
  melt(as.matrix(nci[1:200, ])),
  c("gene", "case", "value")
)
head(nci.long.samp)

# Heat map: one tile per (case, gene) pair, filled by value on a
# 100-step blue-to-red colour ramp.
ggplot(data = nci.long.samp,
       mapping = aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))

習題集

1.帶有分面和顏色的價格直方圖
scale_fill_brewer(type = 'qual')可以修改顏色的編碼方式

data(diamonds)
View(diamonds)

# Histogram of price on a log10 x-axis (35 bins), bars filled by cut,
# one panel per diamond color, using a qualitative brewer palette.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram(bins = 35) +
  scale_x_log10() +
  scale_fill_brewer(type = "qual") +
  facet_wrap(~ color)

2.價格與臺面(table),按切工填色

names(diamonds)

# Price against table, points coloured by cut, x-axis restricted to 50-80
# with a tick every 2. The axis range is given with the full argument name
# `limits`; the original `lim` only worked through partial argument matching.
p1 <- ggplot(aes(x = table, y = price), data = diamonds) +
  geom_point(aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

# Same plot, with cut additionally mapped to the fill aesthetic.
p2 <- ggplot(aes(x = table, y = price, fill = cut), data = diamonds) +
  geom_point(aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

# Draw both plots in one figure for comparison.
library(gridExtra)
grid.arrange(p1, p2)

3.價格與體積和鑽石淨度

# Approximate volume as the product of the x, y and z columns.
diamonds$v <- with(diamonds, x * y * z)

# Price (log10 scale) against volume, coloured by clarity with a diverging
# palette; the x-axis is cut off at the 99th percentile of volume.
ggplot(data = diamonds,
       mapping = aes(x = v, y = price, fill = clarity)) +
  geom_point(mapping = aes(colour = clarity)) +
  xlim(0, quantile(diamonds$v, 0.99)) +
  scale_y_log10() +
  scale_color_brewer(type = "div")

4.新建友誼的比例

# Share of each user's friendships that the user initiated.
pf$prop_initiated <- with(pf, friendships_initiated / friend_count)

5.prop_initiated 與使用時長

# Recompute the joining year (2014 minus tenure / 365, rounded down) and bin
# it at the break points 2004, 2009, 2011, 2012 and 2014.
pf$year_joined <- with(pf, floor(2014 - tenure / 365))
pf$year_joined.buckets <- cut(
  pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# Three takes on the median prop_initiated against tenure, one line per
# joining-year bucket. All of them pass the summary function as `fun`:
# ggplot2 deprecated `fun.y` in 3.3.0 and does not honour it when it is
# forwarded through geom_line(), falling back to mean_se() instead.

# Variant 1: drop every row of pf that has an NA in any column first.
ggplot(aes(x = tenure, y = prop_initiated),
       data = na.omit(pf)) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun = median)

# Variant 2: keep all rows and pass na.rm = TRUE to the layer instead.
ggplot(pf, aes(x = tenure,
               y = prop_initiated,
               color = year_joined.buckets)) +
  geom_line(stat = "summary", fun = median, na.rm = TRUE)

# Variant 3: as variant 2, with a smoothed curve added on top.
ggplot(pf, aes(x = tenure,
               y = prop_initiated,
               color = year_joined.buckets)) +
  geom_line(stat = "summary", fun = median, na.rm = TRUE) +
  geom_smooth()

6.最大的組均值 prop_initiated

# Mean prop_initiated within the "(2012,2014]" bucket, ignoring rows where
# prop_initiated is missing.
# (A leftover, incomplete call `with(pf, year_joined.buckets=)` was removed:
# it supplied no expression to evaluate.)
with(subset(pf, !is.na(prop_initiated) & year_joined.buckets == "(2012,2014]"),
     mean(prop_initiated))

# Summary statistics of prop_initiated for every bucket, to compare the means.
by(pf$prop_initiated, pf$year_joined.buckets, summary)

7.經過分組、分面和填色的價格/克拉

# Price per carat for each cut as jittered points coloured by diamond color
# (diverging brewer palette), one panel per clarity level.
ggplot(data = diamonds, mapping = aes(x = cut, y = price / carat)) +
  geom_jitter(mapping = aes(colour = color)) +
  scale_color_brewer(type = "div") +
  facet_wrap(~ clarity)
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容