我有一个CSV文件,其中包含累计成本列、成本列和关键词列。我的R脚本在小文件上运行正常,但当我用实际文件(约有100万行)运行时,它就完全卡死了。你们能否帮我提高这个脚本的效率?问题出在 Token.Count 这一步,我在生成这一列时遇到了麻烦。谢谢!
# Token Histogram
# Load the CSV report produced by the Report Downloader API feed.
# Text columns are kept as plain character vectors (no factor conversion).
Mydf <- read.csv(
  file = "Output_test.csv.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Logical row mask that restricts the analysis to one slice of the HTT
# (HEAD / TORSO / TAIL) segmentation, based on the CumuCost column.
# Change the cutoff below to:
#   .99 for the big picture
#   .8  for HEAD
limitor <- local({
  cutoff <- .8
  Mydf$CumuCost <= cutoff
})
# Uncomment to measure ONLY the TORSO
#limitor <- (Mydf$CumuCost <= .95 & Mydf$CumuCost > .8)
# Uncomment to measure ONLY the TAIL
#limitor <- (Mydf$CumuCost <= 1 & Mydf$CumuCost > .95)
# Uncomment to measure ONLY the non-HEAD part (TORSO + TAIL)
#limitor <- (Mydf$CumuCost <= 1 & Mydf$CumuCost > .8)
# Label every row of Mydf with its HTT segment, based on CumuCost:
#   CumuCost <= .8         -> "HEAD"
#   .8 < CumuCost <= .95   -> "TORSO"
#   anything larger        -> "TAIL"
# Rows whose CumuCost is NA receive an NA label, because ifelse() propagates
# NA from its test.
# HTT is a plain vector with one label per row; it is built in a single
# vectorised call, so there is no need to create an empty data.frame first.
HTT <- ifelse(
  Mydf$CumuCost <= .8,
  "HEAD",
  ifelse(Mydf$CumuCost <= .95, "TORSO", "TAIL")
)
# Append the labels to Mydf as a column named HTT
Mydf <- transform(Mydf, HTT = HTT)
# Number of keywords in the account, taken as the number of rows of Mydf
KWportfolioSize <- nrow(Mydf)
# Fraction of the portfolio selected by the limitor mask
PercentofPortfolio <- sum(limitor) / KWportfolioSize
# Length of Keyword: number of tokens in each keyword.
#
# The earlier version called tau::textcnt() once per row and appended each
# result to a data.frame with rbind() inside a for loop. Growing a data.frame
# that way copies it on every iteration, so the run time grows quadratically
# and the script stalls on files with ~1M rows. The count is now computed for
# the whole column in one vectorised pass using base R regular expressions.
#
# library(tau) is kept only in case other code in this file still relies on
# it; the token count below no longer calls it.
library(tau)

# Count the tokens in each element of a character vector.
# A token is a maximal run of characters that are neither whitespace nor
# punctuation, i.e. the non-empty pieces left after splitting on
# "[[:space:][:punct:]]+". Empty strings and NA values count as 0 tokens.
# Returns an integer vector with one count per element of `text`.
# NOTE(review): perl = TRUE is meant to mirror textcnt()'s default regex
# engine; spot-check the counts against the old output on a small sample.
count_tokens <- function(text) {
  text <- as.character(text)
  token_matches <- gregexpr("[^[:space:][:punct:]]+", text, perl = TRUE)
  lengths(regmatches(text, token_matches))
}

# Takes one or more row numbers of Mydf and returns the total number of
# tokens in the keywords of those rows. Kept with its original name and
# signature for any existing callers.
Myfun <- function(n) {
  sum(count_tokens(Mydf$Keyword.text[n]))
}

# One-column data.frame holding the token count of every row of Mydf
Token.Count <- data.frame(Token.Count = count_tokens(Mydf$Keyword.text))
# Add the counts to Mydf as a column named Token.Count. Assigning the vector
# directly gives the column the right name, so no renaming step is needed.
Mydf$Token.Count <- Token.Count$Token.Count