# ggplot2 supplies ggplot() and the geoms used by the plots in this script;
# attach it up front so the first plot does not fail in a fresh session.
library(ggplot2)

# Show the working directory and its contents to confirm the data file is
# reachable before reading it.
getwd()
list.files()
# The file is tab-separated, hence the explicit separator.
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
Third Qualitative Variable
在以性別為分類的年齡箱線圖中,加入每個性別的平均年齡
原箱線圖:
# Box plot of age for each gender; rows with a missing gender are excluded.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = age)
) +
  geom_boxplot()
添加后的箱線圖:
# Same age-by-gender box plot, with each group's mean age overlaid as a
# point drawn with shape 4.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = age)
) +
  geom_boxplot() +
  stat_summary(geom = "point", fun.y = mean, shape = 4)
年齡、朋友數、性別三個變量:
# Median friend_count at each age, one line per gender (rows with a missing
# gender are excluded). The summary stat collapses all users of one age
# into a single value.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(
    mapping = aes(color = gender),
    stat = "summary",
    fun.y = median
  )
按年齡和性別對數據進行分組,并計算每個組里的平均好友數,中位數好友數和每個組的數據條目數
# Detach plyr before loading dplyr (presumably to avoid clashes between the
# two packages' same-named verbs). detach() raises an error when the package
# is not on the search path, so only call it when plyr is actually attached.
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}
library(dplyr)

# One row per (age, gender) pair with the mean and median friend count and
# the number of users in the group. Rows with a missing gender are dropped.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    # as.numeric() so the median is computed on doubles.
    friend_count_median = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  # Drop the remaining grouping so later verbs act on the whole table.
  ungroup() %>%
  arrange(age)

head(pf.fc_by_age_gender)
用上面的分組創建圖表
# Median friend count by age, one line per gender, drawn from the
# pre-aggregated table instead of summarising the raw data inside ggplot.
ggplot(
  data = pf.fc_by_age_gender,
  mapping = aes(x = age, y = friend_count_median)
) +
  geom_line(mapping = aes(color = gender))
Thinking in Ratios
女性用戶的好友數是男性用戶好友數的幾倍?
要回答這個問題,先重塑我們的數據
pf.fc_by_age_gender是長格式數據,我們要把它轉化成寬格式數據,
每一行包括:
年齡
對應該年齡的男性用戶的好友數(中位數)
對應該年齡的女性用戶的好友數(中位數)
library(reshape2)

# Long -> wide: one row per age, one column per gender value, each cell
# holding that group's friend_count_median.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "friend_count_median"
)

head(pf.fc_by_age_gender.wide)
函數dcast()中的d表示輸出的數據結構為dataframe
如果要輸出矩陣或者數組,應使用acast()
Ratio Plot
橫軸:年齡
縱軸:女/男好友數的中位數
# Ratio of the female to the male median friend count at each age.
# The faint dashed line at 1 marks where the two medians are equal.
ggplot(
  data = pf.fc_by_age_gender.wide,
  mapping = aes(x = age, y = female / male)
) +
  geom_line() +
  geom_hline(yintercept = 1, linetype = 2, alpha = 0.3)
探索四個變量:年齡,性別,好友數,使用時長tenure
以2014為基準年,添加[加入時間]這個變量
# Approximate calendar year of sign-up: tenure (treated as days, given the
# division by 365) is counted back from 2014 and rounded down.
pf$year_joined <- with(pf, floor(2014 - tenure / 365))
floor為向下取整,返回不大于該數字的最大整數
Cut a Variable
切割變量year_joined,分為以下幾組:
2004-2009,2009-2011,2011-2012,2012-2014
# Inspect the distribution of join years before binning.
summary(pf[["year_joined"]])
table(pf[["year_joined"]])

# Bin the join years into four right-closed intervals (cut()'s default),
# e.g. (2012,2014].
pf$year_joined.buckets <- cut(
  pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)

# Counts per bucket, with an extra NA cell when any value fell outside the bins.
table(pf[["year_joined.buckets"]], useNA = "ifany")
use variable year_joined.buckets to create a line graph
# Median friend count by age, one line per join-year bucket.
# Users without a bucket are left out.
ggplot(
  data = subset(pf, !is.na(year_joined.buckets)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = "median"
  )
the parameter linetype can take the values 0-6:
0 = blank,
1 = solid,
2 = dashed
3 = dotted
4 = dotdash
5 = longdash
6 = twodash
Plot the Grand Mean
# Mean friend count by age for each join-year bucket (coloured lines), plus
# the mean over all plotted users as a dashed reference line.
ggplot(
  data = subset(pf, !is.na(year_joined.buckets)),
  mapping = aes(x = age, y = friend_count)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = mean
  ) +
  # No colour mapping on this layer, so it summarises every bucket together.
  geom_line(stat = "summary", fun.y = mean, linetype = 2)
Friending Rate
# Friends gained per unit of tenure. The filter only needs to rule out a zero
# denominator, so users with tenure == 1 are kept; this matches the
# tenure >= 1 filter used by the friendships-initiated plots in this script.
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
Friendships Initiated
# Mean friendships initiated per unit of tenure, plotted against tenure,
# one line per join-year bucket. tenure >= 1 avoids dividing by zero.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = mean
  )
偏差-方差權衡
# Baseline: mean initiation rate at every distinct tenure value.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = mean
  )

# Tenure rounded to the nearest multiple of 7 before averaging.
ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = mean
  )

# Tenure rounded to the nearest multiple of 30.
ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = mean
  )

# Tenure rounded to the nearest multiple of 90: the coarsest binning.
ggplot(
  data = subset(pf, tenure > 0),
  mapping = aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure)
) +
  geom_line(
    mapping = aes(color = year_joined.buckets),
    stat = "summary",
    fun.y = mean
  )

# Alternative to manual binning: a fitted smoother per join-year bucket.
ggplot(
  data = subset(pf, tenure >= 1),
  mapping = aes(x = tenure, y = friendships_initiated / tenure)
) +
  geom_smooth(mapping = aes(color = year_joined.buckets))
the Yogurt Data Set
# Confirm the working directory, then load the yogurt purchase records.
getwd()
yo <- read.csv(file = "yogurt.csv")
View(yo)

# id is turned into a factor so it is treated as a label, not a number.
yo[["id"]] <- factor(yo[["id"]])
str(yo)
酸奶價格直方圖
# Histogram of yogurt prices with a fixed fill colour.
ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram(fill = I("#FF6374"))
不同的酸奶價格
# Distinct price values, how many there are, and how often each occurs.
unique(yo[["price"]])
length(unique(yo[["price"]]))
table(yo[["price"]])
將一條購買記錄中不同口味的酸奶數量加總,匯總成新變量all_purchases
names(yo)

# Total units per record: the sum of the five flavour columns.
yo <- transform(
  yo,
  all_purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)
all_purchases histogram
# Histogram of units per record, one bar per integer count.
ggplot(data = yo, mapping = aes(x = all_purchases)) +
  geom_histogram(binwidth = 1)
隨時間變化的價格
# Price against time. Jitter and low alpha reduce overplotting; shape 21
# is a circle that takes a fill colour.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_jitter(alpha = 1 / 10, shape = 21, fill = I("#F79420"))
Sampling Observations
對于酸奶數據集,我們可能需要更詳細地調查小樣本的家庭
Looking at Samples of Households
# Fix the RNG seed so the same 16 ids are drawn on every run.
set.seed(4230)
sample.ids <- sample(levels(yo$id), size = 16)
sample.ids

# One panel per sampled id: price over time, with the point size showing
# how many units were bought in that record.
ggplot(
  data = subset(yo, id %in% sample.ids),
  mapping = aes(x = time, y = price)
) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(mapping = aes(size = all_purchases), pch = 1)
Scatterplot Matrix 散點圖矩陣
library(GGally)
theme_set(theme_minimal(base_size = 20))

# Seed the RNG so the 1000-row sample below is reproducible.
set.seed(1836)

# Keep columns 2 through 15 of pf for the pairwise plots.
pf_subset <- pf[, 2:15]
names(pf_subset)

# Scatterplot matrix on a random sample of 1000 rows.
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
set.seed確保得到可重復的結果
Even More Variables
# Load the NCI table and show the column names as read from the file.
nci <- read.table("nci.tsv")
colnames(nci)

# Relabel the columns with consecutive integers (1..64 for the 64-column
# file this script expects). The assignment must target colnames(nci);
# assigning to a bare `colnames` only creates an unrelated variable and
# leaves the data frame's column names untouched.
colnames(nci) <- seq_len(ncol(nci))
Heat Maps
library(reshape2)

# Melt the first 200 rows of nci into long form: one row per cell, with the
# matrix row index, column index and cell value.
nci.long.samp <- melt(as.matrix(nci[1:200, ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

# Heat map: one tile per (case, gene) cell, coloured on a 100-step
# blue-to-red gradient.
ggplot(
  data = nci.long.samp,
  mapping = aes(y = gene, x = case, fill = value)
) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
習題集
1.帶有分面和顏色的價格直方圖
scale_fill_brewer(type = 'qual')可以修改顏色的編碼方式
data(diamonds)
View(diamonds)

# Price histograms on a log10 x axis, one panel per diamond colour, with
# the bars filled by cut using a qualitative palette.
ggplot(data = diamonds, mapping = aes(x = price, fill = cut)) +
  geom_histogram(bins = 35) +
  scale_x_log10() +
  scale_fill_brewer(type = "qual") +
  facet_wrap(~color)
2.價格與按切工填色的表格
names(diamonds)

# Price against table, coloured by cut. The x axis is restricted to 50-80
# with a break every 2 units. `limits` is spelled out in full rather than
# relying on partial argument matching of `lim`.
p1 <- ggplot(aes(x = table, y = price), data = diamonds) +
  geom_point(aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

# Same plot with cut additionally mapped to fill in the base aesthetics.
p2 <- ggplot(aes(x = table, y = price, fill = cut), data = diamonds) +
  geom_point(aes(color = cut)) +
  scale_color_brewer(type = "qual") +
  scale_x_continuous(breaks = seq(50, 80, 2), limits = c(50, 80))

# Stack the two variants for comparison.
library(gridExtra)
grid.arrange(p1, p2)
3.價格與體積和鑽石淨度
# Volume proxy: the product of the three dimension columns.
diamonds$v <- with(diamonds, x * y * z)

# Price (log10 scale) against volume, coloured by clarity. The x axis is cut
# off at the 99th percentile of volume to hide the largest values.
ggplot(
  data = diamonds,
  mapping = aes(x = v, y = price, fill = clarity)
) +
  xlim(0, quantile(diamonds$v, 0.99)) +
  scale_y_log10() +
  geom_point(mapping = aes(color = clarity)) +
  scale_color_brewer(type = "div")
4.新建友誼的比例
# Share of a user's friendships that the user initiated. A friend_count of 0
# makes the ratio non-finite, so later steps must filter or ignore such rows.
pf$prop_initiated <- with(pf, friendships_initiated / friend_count)
5.prop_initiated 與使用時長
# Recompute the join year (tenure counted back from 2014, rounded down) and
# its four right-closed buckets so this exercise can run on its own.
pf$year_joined <- with(pf, floor(2014 - tenure / 365))
pf$year_joined.buckets <- cut(
  pf[["year_joined"]],
  breaks = c(2004, 2009, 2011, 2012, 2014)
)
# Median prop_initiated by tenure, one line per join-year bucket.
# Only rows missing the plotted variables are removed. na.omit(pf) would also
# throw away users with an NA in any unrelated column (e.g. gender, which is
# filtered for NA elsewhere in this script) and so distort the medians.
ggplot(aes(x = tenure, y = prop_initiated),
       data = subset(pf, !is.na(prop_initiated) &
                       !is.na(year_joined.buckets))) +
  geom_line(aes(color = year_joined.buckets),
            stat = "summary",
            fun.y = median)

# Same plot on the full data, letting the layer drop missing values itself.
ggplot(pf, aes(x = tenure,
               y = prop_initiated,
               color = year_joined.buckets)) +
  geom_line(stat = "summary", fun.y = median, na.rm = TRUE)

# As above, with a smoother per bucket added on top of the noisy median lines.
ggplot(pf, aes(x = tenure,
               y = prop_initiated,
               color = year_joined.buckets)) +
  geom_line(stat = "summary", fun.y = median, na.rm = TRUE) +
  geom_smooth()
6.最大的組均值 prop_initiated
# Mean prop_initiated for the most recent join-year bucket. Rows where the
# ratio is missing (e.g. 0 / 0 when friend_count is 0) are excluded first.
# An incomplete `with(pf, year_joined.buckets=)` call that preceded this
# statement has been dropped: it supplied no expression to evaluate and
# stopped the script with an error.
with(subset(pf, !is.na(prop_initiated) & year_joined.buckets == '(2012,2014]'),
     mean(prop_initiated))

# Summary of prop_initiated within every bucket, to compare the group means.
by(pf$prop_initiated, pf$year_joined.buckets, summary)
7.經過分組、分面和填色的價格/克拉
# Price per carat for each cut, one panel per clarity grade, with jittered
# points coloured by diamond colour on a diverging palette.
ggplot(data = diamonds, mapping = aes(x = cut, y = price / carat)) +
  geom_jitter(mapping = aes(color = color)) +
  scale_color_brewer(type = "div") +
  facet_wrap(~clarity)