第 2 章 数据及其模式

侯祥胡 (思行勤业)

章节名：第 2 章 数据及其模式
2015-01-11 21:07:26
## --------------------------- System Setting Start ---------------------------
## 01. Choose a directory and make it the working directory.
## Hard-coded alternative, kept for reference:
# path1 <- "D:/06-Training_SelfPromotion/002_00-Data_Analysis_Presentation/"
# path2 <- "06-Statistics_WuXiZhi/Code_Data_Notes/"
# path3 <- "Data/Chapter02/"
# WorkingDirectory <- paste0(path1, path2, path3)
# choose.dir() is Windows-only and returns NA when the dialog is cancelled,
# so stop with a clear message instead of handing NA to setwd().
WorkingDirectory <- choose.dir()
if (is.na(WorkingDirectory)) {
  stop("No working directory was selected.", call. = FALSE)
}
setwd(WorkingDirectory)
## dir(getwd())
## 02. Load the helper R script and read the input data.
source("MyRCode.R")
# file.choose() has a single logical parameter (`new`); it cannot be given a
# start directory, so it is called without arguments.
rawData <- read.csv(file.choose(), header = FALSE, sep = ",")
## 03. Record the current date and time.
DateTime <- format(Sys.time(), "%Y-%m-%d %H-%M-%S")
## --------------------------- System Setting Ended ---------------------------

## -------------------------- File Name Module Start --------------------------
# Let the user pick a file and extract its file name from the full path.
# file.choose() has a single logical parameter (`new`); it cannot be given a
# start directory, so it is called without arguments.
FilePath <- file.choose()
# Example value on Windows:
# [1] "F:\\JMP-DOE-Statistics\\Statistics-R\\Code_Data_Notes\\
# Chapter01-Code_Answer_Notes.r"
# Path components, split on either separator so that both "\\" and "/" style
# paths are handled (kept because the original script created these objects).
FilePathList <- strsplit(FilePath, "[/\\\\]")
FilePathListLength <- length(FilePathList[[1]])
# basename() returns the last path component regardless of the separator used.
FileName <- basename(FilePath)
# cat() writes the text even when the script is run through source(); a bare
# sprintf() call only shows its value when auto-printed at the console.
cat("The File Name is: ", FileName, "\n", sep = "")

## -------------------------- File Name Module Ended --------------------------


# Example 2.1 -- names.txt
# Excerpt of the data file (the header row holds the six column names that
# the code below refers to, e.g. 性别, 教育, 观点):
#    姓名 性别   教育   籍贯 年龄 观点
#    王芳   女   大学   北京 62   是
#  李泽娜   女   大学   山东 57   否
#    刘伟   男   中学  河北  19   是
#    刘东   男   大学   河北 29   是
#    李锐   男   中学   北京   15   否
#  张节福   男   研究生  北京 41   是
#  赵思雨   男   研究生   山东 48   否
#  唐慧聪   女   中学   山东 47   否
#  熊爱珊   女  大学    山东 23   是
#    王冰   男   大学 河北   23   否

w <- read.table("Data/Chapter02/names.txt", header = TRUE)
# Drop columns 1, 4 and 5 (姓名, 籍贯, 年龄); the remaining columns are
# cross-tabulated.
v <- w[, -c(1, 4, 5)]
tt <- table(v)

# Flat (two-dimensional) display of the full contingency table.
ftable(tt)
#             观点 否 是
# 性别 教育
# 男   大学         1  1
#      研究生       1  1
#      中学         1  1
# 女   大学         1  2
#      研究生       0  0
#      中学         1  0

# Put variables 1 and 3 in the columns, selected by position ...
ftable(tt, col.vars = c(1, 3))
# ... or, equivalently, selected by name.
ftable(tt, col.vars = c("性别", "观点"))
#        性别 男    女
#        观点 否 是 否 是
# 教育
# 大学         1  1  1  2
# 研究生       1  1  0  0
# 中学         1  1  1  0

```{r }
# Two-way flat tables built from the three-way table `tt`:
# variable 2 in the rows, variable 3 in the columns.
ftable(tt, row.vars = 2, col.vars = 3)
#        观点 否 是
# 教育
# 大学         2  3
# 研究生       1  1
# 中学         2  1

# The transposed layout: variable 3 in the rows, variable 2 in the columns.
ftable(tt, col.vars = 2, row.vars = 3)
#      教育 大学 研究生 中学
# 观点
# 否           2      1    2
# 是           3      1    1

```

# Cross tables with xtabs().
# Using every column of `v` gives the same table as tt / table(v).
xtabs(~ ., data = v)
# , , 观点 = 否
#
#     教育
# 性别 大学 研究生 中学
#   男    1      1    1
#   女    1      0    1
#
# , , 观点 = 是
#
#     教育
# 性别 大学 研究生 中学
#   男    1      1    1
#   女    2      0    0

# Two-way table of 性别 by 观点 only.
xtabs(~ 性别 + 观点, data = v)
#     观点
# 性别 否 是
#   男  3  3
#   女  2  2


# Example 2.2 -- Rich.csv
# Read the data.
w <- read.csv("Data/Chapter02/Rich.csv", header = TRUE)
# Print the first 20 rows (two equivalent ways).
w[1:20, ]
head(w, 20)
# Print the top-left 3 x 3 corner.
w[1:3, 1:3]
# Column names.
names(w)
# [1] "Rank"      "Name"      "Net.Worth" "Age"       "Source"    "Residency"
summary(w)
str(w)
# Frequency of each value in column 6 (Residency per the names above),
# sorted in decreasing order; keep the first ten.
v <- rev(sort(table(w[, 6])))[1:10]
# Same for column 5 (Source).
u <- rev(sort(table(w[, 5])))[1:10]
# Margin demo: rep(0, 4) is the zero-margin vector; par() returns the previous
# setting (e.g. $mar 5.1 4.1 4.1 2.1), which is restored right after plot.new().
rep(0, 4)
op <- par(mar = rep(0, 4))
plot.new()
par(op)
# Side by side: 1 row x 2 columns.
par(mfrow = c(1, 2))
# Pie charts of the two top-10 tables. The label size in pie() is controlled
# by `cex`; `cex.names` is a barplot() argument and only triggers
# "not a graphical parameter" warnings when passed to pie().
pie(v, cex = 0.8, main = "by residency")
pie(u, cex = 0.8, main = "by source")

# Stacked: 2 rows x 1 column.
par(mfrow = c(2, 1))
barplot(v, cex.names = 0.8, main = "by residency")
barplot(u, cex.names = 0.8, main = "by source")
# Back to the default 1 x 1 layout.
par(mfrow = c(1, 1))


# Example 2.3 -- the global top 2000 companies (Forbes2000.csv)
w <- read.csv("Data/Chapter02/Forbes2000.csv", header = TRUE)
names(w)
summary(w)
# [1] "Rank"         "Company"      "Country"      "Sales"
# [5] "Profits"      "Assets"       "Market.Value"

# Four histograms (2 x 2 grid) of the log-transformed columns 4 to 7, each
# with a rug of the same values underneath.
par(mfrow = c(2, 2))
for (i in 4:7) {
  column <- w[, i]
  hist(log(column), main = paste("log", names(w)[i]), xlab = "")
  rug(log(column))
}

# Box-and-whisker plot, drawn horizontally, of column 7 for the rows whose
# column 3 equals "China"; followed by a rug and a stem-and-leaf display of
# the same values.
par(mfrow = c(1, 1))
china_value <- w[w[, 3] == "China", 7]
boxplot(china_value, horizontal = TRUE, main = "market value")
rug(china_value)

stem(china_value)

# Scatter plot of Sales against Assets for the first 100 rows; the symbol size
# is the square root of Profits. identify() then waits for mouse clicks and
# labels the clicked points with the company name.
v <- w[1:100, ]
plot(
  v$Assets, v$Sales,
  pch = 1, col = 1,
  xlim = c(-500, 3000), ylim = c(0, 600),
  cex = sqrt(v$Profits)
)
identify(v$Assets, v$Sales, labels = v$Company)

# Density-scaled histograms with a kernel density curve, China next to Germany.
C <- w[w[, 3] == "China", ]
G <- w[w[, 3] == "Germany", ]
par(mfrow = c(1, 2))
hist(
  C$Market.Value,
  breaks = 20,
  main = "histogram of market value(China)",
  xlab = "market value", ylab = "density",
  col = 3, probability = TRUE, ylim = c(0, 0.07)
)
lines(density(C$Market.Value), lwd = 2)
hist(
  G$Market.Value,
  breaks = 20,
  main = "histogram of market value(Germany)",
  xlab = "market value", ylab = "density",
  col = 3, probability = TRUE, ylim = c(0, 0.07)
)
lines(density(G$Market.Value), lwd = 2)
par(mfrow = c(1, 1))

# Southern Oscillation Index as a monthly time series starting in 1950.
# NOTE(review): every other data set in this script is read from
# "Data/Chapter02/"; the same prefix is used here for consistency -- confirm
# that soi.txt lives in that folder.
w <- scan("Data/Chapter02/soi.txt")
w <- ts(w, start = 1950, frequency = 12)
plot(w, ylab = "SOI")
# Dashed horizontal reference line at zero.
abline(h = 0, lty = 2)
title("the southern oscillation index 1950-1995")

# US industrial production indices: all columns except the first two become a
# monthly multivariate time series starting in January 1947.
w <- read.table("Data/Chapter02/USIP.txt", header = TRUE)
v <- ts(w[, -(1:2)], start = c(1947, 1), frequency = 12)
# ts.plot() draws all series in one panel; plot() would produce 8 separate
# panels instead of a single combined figure.
ts.plot(v, col = 1:8, lty = 1:8, xlab = "time", ylab = "indices")
title("US indices of industrial production")
legend("topleft", legend = names(w)[3:10], col = 1:8, lty = 1:8)

# Scatter plot of Salinity against Waterflow with interactive point labels.
w <- read.csv("Data/Chapter02/Salinity.csv", header = TRUE)
# with() evaluates the calls inside the data frame, so the columns can be used
# by name (keeping the axis labels "Waterflow" / "Salinity") without
# attach()/detach() changing the search path.
with(w, {
  plot(Waterflow, Salinity)
  # Call title(); the original `title=("Salinity")` only assigned a variable
  # named `title`, so the plot never received its title.
  title("Salinity")
  # Waits for mouse clicks and labels the clicked points with their row names.
  identify(Waterflow, Salinity, labels = rownames(w))
})

# Read the income data with scan(), which returns a plain vector. With
# read.table() `x` would be a data frame and would have to be converted to a
# vector before its mean could be taken directly.
x <- scan("Data/Chapter02/income.txt")
mean(x)
median(x)

# Acorn data; the fields in the file are separated by commas.
w <- read.table("Data/Chapter02/Acorn.txt", sep = ",", header = TRUE)
mean(w[, 4])
median(w[, 4])
# Density-scaled histogram of column 4 with a rug of the observations.
hist(w[, 4],
  probability = TRUE,
  xlab = "Acorn size", main = "Acorn size"
)
rug(w[, 4])
# Arrows pointing down to the median and the mean on the x axis.
arrows(5, 0.2, median(w[, 4]), 0)
arrows(10, 0.2, mean(w[, 4]), 0)
# locator(2) waits for two mouse clicks; the two labels are placed there.
# (Label spelling corrected from "meadian".)
text(locator(2), c("median=1.8", "mean=3.34"))

# Column 4 split by the value of column 2.
C <- w[w[, 2] == "California", 4]
A <- w[w[, 2] == "Atlantic", 4]
summary(C)
summary(A)
fivenum(C)
fivenum(A)
boxplot(Acorn_size ~ Region, w, horizontal = TRUE)
# Histograms of the two groups side by side on common axis limits, each with
# a rug and a kernel density curve.
par(mfrow = c(1, 2))
hist(C,
  breaks = 12, probability = TRUE,
  xlab = "Acorn size", main = "California",
  xlim = c(0, 18), ylim = c(0, 0.5)
)
rug(C)
lines(density(C))
hist(A,
  breaks = 12, probability = TRUE,
  xlab = "Acorn size", main = "Atlantic",
  xlim = c(0, 18), ylim = c(0, 0.5)
)
rug(A)
lines(density(A))
par(mfrow = c(1, 1))

# Exercises
# Exercise 1
w <- read.csv("Data/Chapter02/Old.csv", header = TRUE)
dim(w)
# Number of rows that contain at least one missing value.
sum(!complete.cases(w))
names(w)
# Several views of column 5: histogram, horizontal box plot, frequency table,
# bar plot and pie chart of the frequencies, stem-and-leaf display.
hist(w[, 5])
boxplot(w[, 5], horizontal = TRUE)
table(w[, 5])
barplot(table(w[, 5]))
pie(table(w[, 5]))
stem(w[, 5])
# Work on the complete rows only from here on.
v <- na.omit(w)
# First column of the rows whose column 5 is at least 15.
v[v[, 5] >= 15, 1]
# All rows, sorted by column 5 in decreasing order.
v[order(v[, 5], decreasing = TRUE), ]
# Rows whose first column equals "China".
v[v[, 1] == "China", ]
median(v[, 5])
mean(v[, 5])

# Exercise 3
w <- read.table("Data/Chapter02/chismoke.dat", header = TRUE)
names(w)
# Contingency table of Count over all remaining columns, and its dimensions.
x0 <- xtabs(Count ~ ., data = w)
x0
dim(x0)
attributes(x0)
# The same using columns 2 to 4 only.
x1 <- xtabs(Count ~ ., data = w[, 2:4])
x1
# The same with column 2 left out.
x2 <- xtabs(Count ~ ., data = w[, -2])
x2
# experimental data

# Exercise 4
# Normal-distribution calculations. The first line evaluates to about
# 5.462129, the standard deviation used with mean 157 below.
# NOTE(review): 5.61819 presumably comes from the same kind of calculation for
# mean 167.8 -- confirm against the exercise text.
(164 - 157) / qnorm(0.9)
pnorm(175, mean = 167.8, sd = 5.61819)
# Upper-tail probabilities use lower.tail = FALSE: 1 - pnorm(226, ...) cancels
# to exactly 0 because pnorm() rounds to 1 that far out in the tail.
pnorm(226, mean = 167.8, sd = 5.61819, lower.tail = FALSE)
pnorm(160, mean = 167.8, sd = 5.61819, lower.tail = FALSE)
pnorm(162, mean = 157, sd = 5.462129)
pnorm(180, mean = 167.8, sd = 5.61819, lower.tail = FALSE)
pnorm(175, mean = 157, sd = 5.462129, lower.tail = FALSE)
36人阅读
> 侯祥胡的所有笔记（6篇）
侯祥胡对本书的所有笔记 · · · · · ·

目录

统计学-基于R的应用本书目录第 1 章通过来学统计很容易 1.1统计是什么? 学统计需要什么? 1...
第 2 章数据及其模式
> 查看全部3篇
说明 · · · · · ·

表示其中内容是对原文的摘抄
第 2 章 数据及其模式

侯祥胡 (思行勤业)

侯祥胡对本书的所有笔记 · · · · · ·

目录

通过来学统计很容易

第 2 章 数据及其模式

说明 · · · · · ·

第 2 章数据及其模式

第 2 章数据及其模式