第94页 Chapter 4 Ranking: Priority Inbox
- 章节名:Chapter 4 Ranking: Priority Inbox
- 页码:第94页
email.thread <- function(threads.matrix) { senders <- threads.matrix[, 1] # 1 senders.freq <- table(senders) # 2 senders.matrix <- cbind(names(senders.freq), senders.freq, log(senders.freq + 1)) # 3 senders.df <- data.frame(senders.matrix, stringsAsFactors=FALSE)# 4 row.names(senders.df) <- 1:nrow(senders.df) # 5 names(senders.df) <- c("From.EMail", "Freq", "Weight") # 6 senders.df$Freq <- as.numeric(senders.df$Freq) # 7 senders.df$Weight <- as.numeric(senders.df$Weight) # 8 return(senders.df) # 9 } 引自 Chapter 4 Ranking: Priority Inbox # 1: 取出threads.matrix的第一列 # 2: 用table求出频数来,存在senders.freq # 3: 造出3列,分别是senders.freq名字,senders.freq, 权重。看一看: > head(senders.matrix,1) senders.freq adam@homeport.org "adam@homeport.org" "1" "0.693147180559945" # 4: 转化成 data.frame > head(senders.df,1) V1 senders.freq V3 adam@homeport.org adam@homeport.org 1 0.693147180559945 # 5: 改行名,用数字代替。 # 6: 改名。 # 7: 数值化。 # 8: 数值化。 # 9: 输出。 > senders.df[1,] From.EMail Freq Weight 1 adam@homeport.org 1 0.6931472 已经很好了。 ##################### thread.counts
thread.counts <- function(thread, email.df) { # Need to check that we are not looking at the original message in a thread, # so we check the subjects against the 're:' cue. thread.times <- email.df$Date[which(email.df$Subject == thread | email.df$Subject == paste("re:", thread))] # 1 freq <- length(thread.times) # 2 min.time <- min(thread.times) # 3 max.time <- max(thread.times) # 4 time.span <- as.numeric(difftime(max.time, min.time, units = "secs")) # 5 if(freq < 2) { return(c(NA, NA, NA)) } else { trans.weight <- freq / time.span # 6 log.trans.weight <- 10 + log(trans.weight, base = 10) # 7 return(c(freq, time.span, log.trans.weight)) # 8 } } 引自 Chapter 4 Ranking: Priority Inbox # 1: 不是要处理所有的电邮,而只是处理电邮串(thread)。所以找出主题与thread完全相同的邮件的时间,以及主题为 "re: " + thread 的邮件的时间。 以下是我的测试 > threads <- unique(threads.matrix[, 2]) > thread = threads[[1]] > thread [1] "please help a newbie compile mplayer :-)" > thread.times <- email.df$Date[which(email.df$Subject == thread | + email.df$Subject == paste("re:", thread))] > thread.times [1] "2002-01-31 22:44:14 MYT" "2002-02-01 00:53:41 MYT" [3] "2002-02-01 02:01:44 MYT" "2002-02-01 10:29:23 MYT" # 2: 次数。 上例中 freq = 4 # 3: 最早时间。 上例中 min.time "2002-01-31 22:44:14 MYT" # 4: 最后时间。 上例中 max.time "2002-02-01 10:29:23 MYT" # 5: 最早,最后时间间隔多少秒。 上例中time.span = 42309 Secs # 6: 少于2次的,就输出NA。 多的就算 次数/间隔时间 # 7: 取对数 + 10,为的是不会有负数出现。 # 8: 输出 ############################ get.threads
get.threads <- function(threads.matrix, email.df) { threads <- unique(threads.matrix[, 2]) # 1 thread.counts <- lapply(threads, function(t) thread.counts(t, email.df)) # 2 thread.matrix <- do.call(rbind, thread.counts) # 3 return(cbind(threads, thread.matrix)) # 4 } 引自 Chapter 4 Ranking: Priority Inbox # 1 threads.matrix里面是两列,第一列是送信人,第二列是主题,这里取出不重复的主题。 # 2 对每个主题调用 thread.counts 进行计算。 # 3 把各主题的结果按行合并成矩阵。 # 4 再与主题列并在一起,成为输出。 thread.weights <- get.threads(threads.matrix, priority.train) # 1 thread.weights <- data.frame(thread.weights, stringsAsFactors = FALSE) # 2 names(thread.weights) <- c("Thread", "Freq", "Response", "Weight") # 3 thread.weights$Freq <- as.numeric(thread.weights$Freq) # 4 thread.weights$Response <- as.numeric(thread.weights$Response) # 5 thread.weights$Weight <- as.numeric(thread.weights$Weight) # 6 thread.weights <- subset(thread.weights, is.na(thread.weights$Freq) == FALSE) # 7 终于把数据代入,得到结果。 # 7 只保留 Freq 不为 NA 的邮件串(thread.counts 对少于2封邮件的邮件串返回 NA,所以留下的都是至少有2封邮件的邮件串)。 ######################### term.counts
term.counts <- function(term.vec, control) { vec.corpus <- Corpus(VectorSource(term.vec)) vec.tdm <- TermDocumentMatrix(vec.corpus, control = control) return(rowSums(as.matrix(vec.tdm))) } 引自 Chapter 4 Ranking: Priority Inbox 这个和第三章的提到的一样,就是找出语料库矩阵。 thread.terms <- term.counts(thread.weights$Thread, control = list(stopwords = TRUE)) > tail(thread.terms) xdegress xml-in-silicon yahoo you? 1 1 1 1 young zip 1 1 > head(thread.terms) 'apt-get --with "holiday "ouch. "requires:" (almost) 1 1 1 1 1 1 thread.terms <- names(thread.terms) > head(thread.terms) [1] "'apt-get" "--with" "\"holiday" "\"ouch." [5] "\"requires:\"" "(almost)" > thread.terms [1] "'apt-get" "--with" [245] "low" "machine" [247] "made" "mail" [249] "mail." "maildrop" [251] "major" "make" [291] "one," "oops" [293] "operators" "option" [295] "ot:" "ouch." [297] "ouch..." "ouch...)" [299] "ouch....\"(was" "p2p/distributed" [347] "relational" "release" [349] "release:" "release?" [351] "released" "removed" [353] "removing" "report" [355] "requested" "results" [357] "review" "revocation" [383] "signers" "site" [385] "slaughter" "slow" [387] "socialism" "sofware" [463] "word" "working!!" [465] "working," "wrold," [467] "xdegress" "xml-in-silicon" [469] "yahoo" "you?" 
[471] "young" "zip" > t= thread.terms[[388]]; grepl(t, thread.weights$Thread, fixed = TRUE) [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [11] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [21] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [31] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [51] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE [171] FALSE FALSE FALSE FALSE FALSE FALSE term.weights <- sapply(thread.terms, function(t) mean(thread.weights$Weight[grepl(t, thread.weights$Thread, fixed = TRUE)])) 这句的目的是为了找出所有含有该词的电邮串,把他们的权重取平均作为它的新权重。 term.weights <- data.frame(list(Term = names(term.weights), Weight = term.weights), stringsAsFactors = FALSE, row.names = 1:length(term.weights)) > head(thread.weights) Thread Freq Response 1 please help a newbie compile mplayer :-) 4 42309 2 prob. 
w/ install/uninstall 4 23745 3 http://apt.nixia.no/ 10 265303 4 problems with 'apt-get -f install' 3 55960 5 problems with apt update 2 6347 6 about apt, kernel updates and dist-upgrade 5 240238 Weight 1 5.975627 2 6.226488 3 5.576258 4 5.729244 5 6.498461 6 5.318328 对全体训练数据的邮件内容进行词条抽取。 也按频数的对数给予权重。 msg.terms <- term.counts(priority.train$Message, control = list(stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE)) msg.weights <- data.frame(list(Term = names(msg.terms), Weight = log(msg.terms, base = 10)), stringsAsFactors = FALSE, row.names = 1:length(msg.terms)) # Remove words that have a zero weight msg.weights <- subset(msg.weights, Weight > 0) > head(msg.weights) Term Weight 10 ‘i’m 0.3010300 13 ‘look 0.3010300 15 ‘mr 0.4771213 26 “and 0.7781513 30 “as 0.3010300 46 “for 0.3010300 > tail(msg.weights) Term Weight 20893 zol 0.7781513 20894 zone 0.8450980 20896 zones 0.4771213 20897 zoo 0.3010300 20898 zoomable 0.3010300 20899 zope 0.9030900 #Zope是一个开源面向对象的web应用服务器,用Python写成。 到现在我们已经有5个权重方案了。 from.weight (发信人在全部邮件中的出现频率) senders.df (发信人在电邮串中的活跃度) thread.weights (电邮串的活跃度) term.weights (在活跃电邮串中的词条) msg.weights (普通词条) ############################ 主程序
rank.message <- function(path) { msg <- parse.email(path) # Weighting based on message author # First is just on the total frequency from <- ifelse(length(which(from.weight$From.EMail == msg[2])) > 0, from.weight$Weight[which(from.weight$From.EMail == msg[2])], 1) # Second is based on senders in threads, and threads themselves thread.from <- ifelse(length(which(senders.df$From.EMail == msg[2])) > 0, senders.df$Weight[which(senders.df$From.EMail == msg[2])], 1) subj <- strsplit(tolower(msg[3]), "re: ") is.thread <- ifelse(subj[[1]][1] == "", TRUE, FALSE) if(is.thread) { activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE) } else { activity <- 1 } # Next, weight based on terms # Weight based on terms in threads thread.terms <- term.counts(msg[3], control = list(stopwords = TRUE)) thread.terms.weights <- get.weights(thread.terms, term.weights) # Weight based terms in all messages msg.terms <- term.counts(msg[4], control = list(stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE)) msg.weights <- get.weights(msg.terms, msg.weights) # Calculate rank by interacting all weights rank <- prod(from, thread.from, activity, thread.terms.weights, msg.weights) return(c(msg[1], msg[2], msg[3], rank)) } 引自 Chapter 4 Ranking: Priority Inbox
windy guo对本书的所有笔记 · · · · · ·
-
第93页 Chapter 4 Ranking: Priority Inbox
第三章Classification : Spam Filtering涉及的是二元分类器的问题,比如判断一个人的性别(只...
-
第73页 Classification : Spam Filtering
# Get all the SPAM-y email into a single vector spam.docs <- dir(spam.path) #1 spam...
-
第94页 Chapter 4 Ranking: Priority Inbox
-
第127页 Chapter 5: Regression
这一章讲的是回归,相对的简单。 一个例子是抽烟者和不抽烟者的寿命数据。求何时MSE(mean sq...
说明 · · · · · ·
表示其中内容是对原文的摘抄