第 2 章 数据及其模式
- 章节名:第 2 章 数据及其模式
## --------------------------- System Setting Start ---------------------------
## Interactive setup: pick the working directory, load the helper script and
## the raw data, and record a timestamp.

## 01. Choose Directory AND Set As Working Directory
# path1 <- "D:/06-Training_SelfPromotion/002_00-Data_Analysis_Presentation/"
# path2 <- "06-Statistics_WuXiZhi/Code_Data_Notes/"
# path3 <- "Data/Chapter02/"
# WorkingDirectory <- paste(path1, path2, path3, sep = "")

# choose.dir() is only available in the Windows build of R; on other
# platforms keep the current directory instead of failing.
WorkingDirectory <- if (.Platform$OS.type == "windows") {
  choose.dir()
} else {
  getwd()
}
# choose.dir() returns NA when the dialog is cancelled; stop with a clear
# message rather than letting setwd() fail on NA.
if (is.na(WorkingDirectory)) {
  stop("No working directory was selected.", call. = FALSE)
}
setwd(WorkingDirectory)
## FilePath <- file.choose(WorkingDirectory)
## FilePath <- file.choose(getwd())
## dir(getwd())

## 02. loading R Script and input data
source("MyRCode.R")
# rawData <- read.csv(file.choose(WorkingDirectory), header = F, sep = ',')
# file.choose() takes only the logical `new` flag, not a start directory,
# so it is called without arguments.
rawData <- read.csv(file.choose(), header = FALSE, sep = ",")

## 03. Record Current Date Time
DateTime <- format(Sys.time(), "%Y-%m-%d %H-%M-%S")
## --------------------------- System Setting Ended ---------------------------

## -------------------------- File Name Module Start --------------------------
# Ask for a file and extract its name (last path component).
FilePath <- file.choose()
# Example value on Windows:
# [1] "F:\\JMP-DOE-Statistics\\Statistics-R\\Code_Data_Notes\\
#      Chapter01-Code_Answer_Notes.r"

# Split the path into its components; accept both "\" and "/" so the
# module also works with forward-slash paths.
FilePathList <- strsplit(FilePath, "[\\\\/]")

# number of path components
FilePathListLength <- length(FilePathList[[1]])

# the last component is the file name
FileName <- FilePathList[[1]][FilePathListLength]
# These values are shown only through top-level auto-printing.
sprintf("The File Name is: ")
sprintf("%s", FileName)
## -------------------------- File Name Module Ended --------------------------
# e.g.
# 2.1 names.txt
# Contingency tables for a small survey data set.
# Contents of the file
# (columns: name, sex, education, hometown, age, opinion):
# 姓名   性别 教育   籍贯 年龄 观点
# 王芳   女   大学   北京 62   是
# 李泽娜 女   大学   山东 57   否
# 刘伟   男   中学   河北 19   是
# 刘东   男   大学   河北 29   是
# 李锐   男   中学   北京 15   否
# 张节福 男   研究生 北京 41   是
# 赵思雨 男   研究生 山东 48   否
# 唐慧聪 女   中学   山东 47   否
# 熊爱珊 女   大学   山东 23   是
# 王冰   男   大学   河北 23   否
w <- read.table("Data/Chapter02/names.txt", header = TRUE)

# drop columns 1, 4, 5 (姓名 name, 籍贯 hometown, 年龄 age)
v <- w[, -c(1, 4, 5)]
tt <- table(v)

# flat table
ftable(tt)
#             观点 否 是
# 性别 教育
# 男   大学        1  1
#      研究生      1  1
#      中学        1  1
# 女   大学        1  2
#      研究生      0  0
#      中学        1  0

ftable(tt, col.vars = c(1, 3))
# equivalent to ftable(tt, col.vars = c(1, 3))
ftable(tt, col.vars = c("性别", "观点"))
#        性别 男     女
#        观点 否 是  否 是
# 教育
# 大学        1  1   1  2
# 研究生      1  1   0  0
# 中学        1  1   1  0

# 2 dimension flat table
ftable(tt, col.vars = 3, row.vars = 2)
#        观点 否 是
# 教育
# 大学        2  3
# 研究生      1  1
# 中学        2  1

ftable(tt, row.vars = 3, col.vars = 2)
#      教育 大学 研究生 中学
# 观点
# 否        2    1      2
# 是        3    1      1

# xtabs() cross table
xtabs(~., v) # same as tt, same as table(v)
# , , 观点 = 否
#
#      教育
# 性别 大学 研究生 中学
#   男    1      1    1
#   女    1      0    1
#
# , , 观点 = 是
#
#      教育
# 性别 大学 研究生 中学
#   男    1      1    1
#   女    2      0    0

xtabs(~性别+观点, v)
#      观点
# 性别 否 是
#   男  3  3
#   女  2  2

# e.g. 2.2
# read data
w <- read.csv("Data/Chapter02/Rich.csv", header = TRUE)

# print the first 20 lines (two equivalent ways)
w[1:20, ]
head(w, 20)

# print 3x3
w[1:3, 1:3]

names(w)
# [1] "Rank" "Name" "Net.Worth" "Age" "Source" "Residency"
summary(w)
str(w)

# sort by Residency (column 6) and retrieve top 10
v <- rev(sort(table(w[, 6])))[1:10]
# sort by Source (column 5) and retrieve top 10
u <- rev(sort(table(w[, 5])))[1:10]

# set all margins to 0, start an empty plot, then restore the previous
# margins that par() returned
op <- par(mar = rep(0, 4))
# op
# $mar
# [1] 5.1 4.1 4.1 2.1
plot.new()
par(op)

# side-by-side 1 Row X 2 Column
par(mfrow = c(1, 2))
# cex.names is a barplot() argument; pie() sizes its labels through the
# graphical parameter cex.
# pie chart Top 10 by residency
pie(v, cex = 0.8, main = "by residency")
# pie chart Top 10 by source
pie(u, cex = 0.8, main = "by source")

# stacked 2 Row X 1 Column
par(mfrow = c(2, 1))
barplot(v, cex.names = 0.8, main = "by residency")
barplot(u, cex.names = 0.8, main = "by source")

# set back to default 1 x 1
par(mfrow = c(1, 1))
# e.g.
# 2.3 global top 2000 Company
# Histograms, box plots, scatter plots and time-series plots for several
# chapter 2 data sets.
w <- read.csv("Data/Chapter02/Forbes2000.csv", header = TRUE)
names(w)
summary(w)
# [1] "Rank" "Company" "Country" "Sales"
# [5] "Profits" "Assets" "Market.Value"

# draw 4 histograms of the log-transformed columns 4 to 7
par(mfrow = c(2, 2))
for (i in 4:7) {
  log_values <- log(w[, i])
  hist(log_values, main = paste("log", names(w)[i]), xlab = "")
  rug(log_values)
}

# box plot, box-and-whisker plot
# draw China companies' market value (column 7), horizontally positioned
par(mfrow = c(1, 1))
china_market_value <- w[w[, 3] == "China", 7]
boxplot(china_market_value, horizontal = TRUE, main = "market value")
rug(china_market_value)
stem(china_market_value)

# scatter plot of the first 100 rows; symbol size scales with sqrt(Profits)
v <- w[1:100, ]
plot(v$Assets, v$Sales, pch = 1, col = 1,
     xlim = c(-500, 3000), ylim = c(0, 600),
     cex = sqrt(v$Profits))
# interactive: click points to label them with the company name
identify(v$Assets, v$Sales, labels = v$Company)

# compare market value distributions: China vs Germany
C <- w[w[, 3] == "China", ]
G <- w[w[, 3] == "Germany", ]
par(mfrow = c(1, 2))
hist(C$Market.Value, 20, main = "histogram of market value(China)",
     xlab = "market value", ylab = "density",
     col = 3, prob = TRUE, ylim = c(0, 0.07))
lines(density(C$Market.Value), lwd = 2)
hist(G$Market.Value, 20, main = "histogram of market value(Germany)",
     xlab = "market value", ylab = "density",
     col = 3, prob = TRUE, ylim = c(0, 0.07))
lines(density(G$Market.Value), lwd = 2)
par(mfrow = c(1, 1))

# monthly time series starting in 1950
# NOTE(review): path aligned with the other reads, which all use
# "Data/Chapter02/" -- confirm that soi.txt lives there too.
w <- scan("Data/Chapter02/soi.txt")
w <- ts(w, start = 1950, frequency = 12)
plot(w, ylab = "SOI")
abline(h = 0, lty = 2)
title("the southern oscillation index 1950-1995")

# monthly series built from every column except the first two
w <- read.table("Data/Chapter02/USIP.txt", header = TRUE)
v <- ts(w[, -c(1, 2)], start = c(1947, 1), frequency = 12)
# with plot() this would give 8 separate panels instead of one combined plot
ts.plot(v, lty = 1:8, col = 1:8, ylab = "indices", xlab = "time")
title("US indices of industrial production")
legend("topleft", legend = names(w)[3:10], lty = 1:8, col = 1:8)

# Salinity against Waterflow; columns are accessed explicitly instead of
# attach()/detach(), with axis labels set to the bare column names.
w <- read.csv("Data/Chapter02/Salinity.csv", header = TRUE)
plot(w$Waterflow, w$Salinity, xlab = "Waterflow", ylab = "Salinity")
# call title(); `title=("Salinity")` only assigned a variable named title
title("Salinity")
# interactive: click points to label them with their row name
identify(w$Waterflow, w$Salinity, labels = rownames(w))

# scan() returns a plain vector; read.table() would give a data frame,
# which would have to be converted to a vector before taking the mean
x <- scan("Data/Chapter02/income.txt")
mean(x)
median(x)

# the data is comma-separated
w <- read.table("Data/Chapter02/Acorn.txt", sep = ",", header = TRUE)
mean(w[, 4])
median(w[, 4])
hist(w[, 4], prob = TRUE, xlab = "Acorn size", main = "Acorn size")
rug(w[, 4])
# arrows pointing at the median and the mean on the x axis
arrows(5, 0.2, median(w[, 4]), 0)
arrows(10, 0.2, mean(w[, 4]), 0)
# Label the current plot at two interactively clicked positions.
# (label typo "meadian" corrected to "median")
text(locator(2), c("median=1.8", "mean=3.34"))

# Column 4 of w for the California and Atlantic rows (selected on column 2).
# w is expected to hold the Acorn data read just before this point.
C <- w[w[, 2] == "California", 4]
A <- w[w[, 2] == "Atlantic", 4]
summary(C)
summary(A)
fivenum(C)
fivenum(A)
boxplot(Acorn_size ~ Region, w, horizontal = TRUE)

# side-by-side histograms on common axes, with rug and density curve
par(mfrow = c(1, 2))
hist(C, 12, prob = TRUE, xlab = "Acorn size", main = "California",
     xlim = c(0, 18), ylim = c(0, 0.5))
rug(C)
lines(density(C))
hist(A, 12, prob = TRUE, xlab = "Acorn size", main = "Atlantic",
     xlim = c(0, 18), ylim = c(0, 0.5))
rug(A)
lines(density(A))
par(mfrow = c(1, 1))

# Exercises
# 1
w <- read.csv("Data/Chapter02/Old.csv", header = TRUE)
dim(w)
# number of rows that contain at least one missing value
nrow(w) - nrow(na.omit(w))
names(w)
hist(w[, 5])
boxplot(w[, 5], horizontal = TRUE)
table(w[, 5])
barplot(table(w[, 5]))
pie(table(w[, 5]))
stem(w[, 5])
# work on complete rows only
v <- na.omit(w)
# column 1 of the rows whose column 5 is at least 15
v[v[, 5] >= 15, 1]
# rows sorted by column 5, largest first
v[order(v[, 5], decreasing = TRUE), ]
v[v[, 1] == "China", ]
median(v[, 5])
mean(v[, 5])

# 3
w <- read.table("Data/Chapter02/chismoke.dat", header = TRUE)
names(w)
# cross-tabulate Count by all other columns
x0 <- xtabs(Count ~ ., w)
x0
dim(x0)
attributes(x0)
# the same using only columns 2 to 4
x1 <- xtabs(Count ~ ., w[, 2:4])
x1
# the same with column 2 left out
x2 <- xtabs(Count ~ ., w[, -2])
x2

# experimental data
# 4
# normal-distribution calculations; pnorm(q, mean, sd) gives P(X <= q)
# presumably the sd used with mean 157 below (evaluates to about 5.462)
(164 - 157) / qnorm(0.9)
pnorm(175, 167.8, 5.61819)
1 - pnorm(226, 167.8, 5.61819)
1 - pnorm(160, 167.8, 5.61819)
pnorm(162, 157, 5.462129)
1 - pnorm(180, 167.8, 5.61819)
1 - pnorm(175, 157, 5.462129)
36人阅读
说明 · · · · · ·
表示其中内容是对原文的摘抄