Question

我使用 MALLET 进行主题分析,它把结果输出到一个文本文件(“topics.txt”)中,该文件有几千行、一百列左右,每一行由制表符分隔的变量组成,形式如下:

Num1 text1 topic1 proportion1 topic2 proportion2 topic3 proportion3,  etc.
Num2 text2 topic1 proportion1 topic2 proportion2 topic3 proportion3,  etc.
Num3 text3 topic1 proportion1 topic2 proportion2 topic3 proportion3,  etc.

Here's a snippet of the actual data:

> dat[1:5,1:10]

  V1 V2 V3    V4 V5        V6 V7        V8 V9        V10
1  0 10.txt   27 0.4560785 23 0.3040853 20 0.1315621 21 0.03632624
2  1 1001.txt 20 0.2660085 12 0.2099153  8 0.1699586 13 0.16922928
3  2 1002.txt 16 0.3341721  2 0.1747023 10 0.1360454 12 0.07507119
4  3 1003.txt 12 0.5366148  8 0.2255179 18 0.1388561  0 0.01867091
5  4 1005.txt 16 0.2363206  0 0.2214441 24 0.1914769  7 0.17760521

我试图用 R 把这一输出转换成一个数据表:以各主题作为列标题,每个主题列中填入紧邻每个“topic”变量右侧的“proportion”变量的值,每个“text”值对应一行。类似这样:

      topic1       topic2       topic3
text1 proportion1  proportion2  proportion3
text2 proportion1  proportion2  proportion3

或采用以上数据表:

           0         2         7         8         10        12        13        16        18       20        21         23        24         27
10.txt     0         0         0         0         0         0         0         0         0        0.1315621 0.03632624 0.3040853 0          0.4560785        
1001.txt   0         0         0         0.1699586 0         0.2099153 0.1692292 0         0        0.2660085 0          0         0          0
1002.txt   0         0.1747023 0         0         0.1360454 0.0750711 0         0.3341721 0        0         0          0         0          0
1003.txt   0.0186709 0         0         0.2255179 0         0.5366148 0         0         0.138856 0         0          0         0          0
1005.txt   0.2214441 0         0.1776052 0         0         0         0         0.2363206 0        0         0          0         0.1914769  0

下面是朋友发给我、用来完成这项工作的代码,但它在我这里无法运行(我对它的理解也不足以自己修复):

##########################################
# Read the MALLET output file: no header row, tab-separated fields.
dat<-read.table("topics.txt", header=F, sep="	")
# Column 2 only, kept as a one-column data frame (the file name in the
# sample data shown above).
datnames<-subset(dat, select=2)
# Columns 3 onward: alternating topic / proportion columns.
dat2<-subset(dat, select=3:length(dat))
# Empty long-format accumulator with columns topic, proportion, text.
y <- data.frame(topic=character(0),proportion=character(0),text=character(0))
# Walk dat2 two columns at a time: column i is a topic, column i + 1 its
# proportion. Each pair is tagged with the text name and appended to y.
# NOTE(review): if dat2 has an odd number of columns, the last i + 1 points
# past the final column -- confirm against the real file.
for(i in seq(1, length(dat2), 2)){ 
z<-i+1
x<-dat2[,i:z]
x<-cbind(x, datnames)
colnames(x)<-c("topic","proportion", "text")
y<-rbind(y, x)
}

# Right at this step at the end of the block 
# I get this message that may indicate the problem:
# Error in c(in c("topic", "proportion", "text") : unused argument(s) ("text")

# Replace missing cells with 0 before tabulating.
y[is.na(y)] <- 0 
# Cross-tabulate: proportion summed for each text (rows) by topic (columns).
xdat<-xtabs(proportion ~ text+topic, data=y)  
# Write the table tab-separated, quoting strings, with row and column names.
write.table(xdat, file="topicMatrix.txt", sep="	", eol = "
", quote=TRUE, col.names=TRUE, row.names=TRUE)
##########################################

如果有人能提示我如何让这段代码正常运行,我将不胜感激。我的问题可能与另外两个已有的问题有关(原文中的链接已丢失),但我目前还不具备直接利用那些回答的能力。

Answer 1

Here is one approach to your problem

 # Toy stand-in for the MALLET output: an id, a text name, then
 # alternating topic / proportion columns.
 dat <- read.table(
   text = "Num1 text1 topic1 proportion1 topic2 proportion2 topic3 proportion3
   Num2 text2 topic1 proportion1 topic2 proportion2 topic3 proportion3
   Num3 text3 topic1 proportion1 topic2 proportion2 topic3 proportion3",
   header = FALSE, as.is = TRUE
 )

 # Number of topic/proportion pairs per row; change to match your data.
 NTOPICS <- 3
 # Column names: num, text, topic1, proportion1, topic2, proportion2, ...
 nam <- c(
   "num", "text",
   paste0(c("topic", "proportion"), rep(seq_len(NTOPICS), each = 2))
 )

 # Wide -> long. With sep = "" reshape() splits a name such as "topic1"
 # into the variable name "topic" and the index 1, so the result has one
 # `topic` and one `proportion` column.
 dat_l <- reshape(
   setNames(dat, nam),
   varying = 3:length(nam), direction = "long", sep = ""
 )
 # Long -> wide: one column per distinct topic, filled from `proportion`.
 reshape2::dcast(dat_l, num + text ~ topic, value.var = "proportion")

num  text      topic1      topic2      topic3
1 Num1 text1 proportion1 proportion2 proportion3
2 Num2 text2 proportion1 proportion2 proportion3
3 Num3 text3 proportion1 proportion2 proportion3

EDIT. This will work irrespective of whether the proportions are text or numbers. You can also modify NTOPICS to suit the number of topics you have

Answer 2

You can get this into a long format but to go further required real data. EDITED after data offered. Still not sure about the overall structure of what is coming from MALLET, but at least the R functions are demonstrated. This approach has the "feature" that proportions are summed if there are overlapping topics. Depending on the data layout that may be an advantage or not.

# Rebuild the data sample from the question. The header line has one entry
# fewer than the data rows, so read.table() takes the first field of each
# row as the row name. Passing the data through `text =` lets read.table()
# manage (and close) the connection itself.
dat <- read.table(
  text = "  V1 V2 V3  V4 V5  V6 V7  V8 V9  V10
1  0 10.txt   27 0.4560785 23 0.3040853 20 0.1315621 21 0.03632624
2  1 1001.txt 20 0.2660085 12 0.2099153  8 0.1699586 13 0.16922928
3  2 1002.txt 16 0.3341721  2 0.1747023 10 0.1360454 12 0.07507119
4  3 1003.txt 12 0.5366148  8 0.2255179 18 0.1388561  0 0.01867091
5  4 1005.txt 16 0.2363206  0 0.2214441 24 0.1914769  7 0.17760521
",
  header = TRUE
)

# Wide -> long. Each element of `varying` lists the columns stacked into one
# long column: V3/V5/V7/V9 hold the topic ids and V4/V6/V8/V10 the matching
# proportions. The stacked columns keep the name of the first column of each
# set (V3 and V4); `time` records which pair a row came from.
ldat <- reshape(
  dat,
  idvar = c("V1", "V2"),
  varying = list(
    topics = c("V3", "V5", "V7", "V9"),
    props = c("V4", "V6", "V8", "V10")
  ),
  direction = "long"
)
####------------------####
    > ldat
             V1       V2 time V3         V4
0.10.txt.1    0   10.txt    1 27 0.45607850
1.1001.txt.1  1 1001.txt    1 20 0.26600850
2.1002.txt.1  2 1002.txt    1 16 0.33417210
3.1003.txt.1  3 1003.txt    1 12 0.53661480
4.1005.txt.1  4 1005.txt    1 16 0.23632060
0.10.txt.2    0   10.txt    2 23 0.30408530
1.1001.txt.2  1 1001.txt    2 12 0.20991530
2.1002.txt.2  2 1002.txt    2  2 0.17470230
3.1003.txt.2  3 1003.txt    2  8 0.22551790
4.1005.txt.2  4 1005.txt    2  0 0.22144410
0.10.txt.3    0   10.txt    3 20 0.13156210
1.1001.txt.3  1 1001.txt    3  8 0.16995860
2.1002.txt.3  2 1002.txt    3 10 0.13604540
3.1003.txt.3  3 1003.txt    3 18 0.13885610
4.1005.txt.3  4 1005.txt    3 24 0.19147690
0.10.txt.4    0   10.txt    4 21 0.03632624
1.1001.txt.4  1 1001.txt    4 13 0.16922928
2.1002.txt.4  2 1002.txt    4 12 0.07507119
3.1003.txt.4  3 1003.txt    4  0 0.01867091
4.1005.txt.4  4 1005.txt    4  7 0.17760521

既然这些“proportions”是数值型(numeric),现在就可以演示如何使用 xtabs 了。类似下面这样的结果最终可能就是你想要的。让我意外的是主题也是整数,不过也许存在从主题编号到主题名称的映射?

> xtabs(V4 ~ V3 + V2, data=ldat)
    V2
V3       10.txt   1001.txt   1002.txt   1003.txt   1005.txt
  0  0.00000000 0.00000000 0.00000000 0.01867091 0.22144410
  2  0.00000000 0.00000000 0.17470230 0.00000000 0.00000000
  7  0.00000000 0.00000000 0.00000000 0.00000000 0.17760521
  8  0.00000000 0.16995860 0.00000000 0.22551790 0.00000000
  10 0.00000000 0.00000000 0.13604540 0.00000000 0.00000000
  12 0.00000000 0.20991530 0.07507119 0.53661480 0.00000000
  13 0.00000000 0.16922928 0.00000000 0.00000000 0.00000000
  16 0.00000000 0.00000000 0.33417210 0.00000000 0.23632060
  18 0.00000000 0.00000000 0.00000000 0.13885610 0.00000000
  20 0.13156210 0.26600850 0.00000000 0.00000000 0.00000000
  21 0.03632624 0.00000000 0.00000000 0.00000000 0.00000000
  23 0.30408530 0.00000000 0.00000000 0.00000000 0.00000000
  24 0.00000000 0.00000000 0.00000000 0.00000000 0.19147690
  27 0.45607850 0.00000000 0.00000000 0.00000000 0.00000000

Answer 3

回到这个问题上,我发现 `reshape` 函数过于占用内存,因此改用了 `data.table` 的方法。步骤多了几个,但速度快得多,内存占用也大大降低。

# data.table must be attached before the first data.table() call below.
library(data.table)

# Sample of the MALLET output shown in the question; the first field of
# each data row becomes the row name.
dat <- read.table(text = "V1 V2 V3    V4 V5        V6 V7        V8 V9        V10
1  0 10.txt   27 0.4560785 23 0.3040853 20 0.1315621 21 0.03632624
2  1 1001.txt 20 0.2660085 12 0.2099153  8 0.1699586 13 0.16922928
3  2 1002.txt 16 0.3341721  2 0.1747023 10 0.1360454 12 0.07507119
4  3 1003.txt 12 0.5366148  8 0.2255179 18 0.1388561  0 0.01867091
5  4 1005.txt 16 0.2363206  0 0.2214441 24 0.1914769  7 0.17760521")

dat$V11 <- rep(NA, nrow(dat)) # my real data has this extra unwanted col
dat <- data.table(dat)

# get document number
docnum <- dat$V1
# get text number
txt <- dat$V2

# remove doc num, text num and the trailing extra column so we just have
# topic and props (`:=` works in place, so `dat` itself is changed too)
dat1 <- dat[ ,c("V1","V2", paste0("V", ncol(dat))) := NULL]
# get topic numbers (odd columns); the result has one column per document
n <- ncol(dat1)
tops <- apply(dat1, 1, function(i) i[seq(1, n, 2)])
# get props (even columns)
props <- apply(dat1, 1, function(i) i[seq(2, n, 2)])

# put topics and props together, one data frame per document; data.frame()
# derives the column names tops...i. and props...i. that are used below
tp <- lapply(seq_len(ncol(tops)), function(i) data.frame(tops[,i], props[,i]))
names(tp) <- txt
# make into long table
dt <- data.table::rbindlist(tp)
# label every row with its document: each document contributes n / 2 rows
dt$doc <- rep(txt, each = n / 2)
dt$docnum <- rep(docnum, each = n / 2)

# reshape to wide: join onto every topic x doc combination, then spread the
# props of each topic into one column per document
setkey(dt, tops...i., doc)
out <- dt[CJ(unique(tops...i.), unique(doc))][, as.list(props...i.), by=tops...i.]
# NOTE(review): this labels the columns in the order of `txt`; it presumably
# relies on that order matching the sorted doc order of the join -- confirm
setnames(out, c("topic", as.character(txt)))

# transpose to have table of docs (rows) and columns (topics)
tout <- data.table(t(out))
# the first row holds the topic numbers: use it as column names, then drop it
setnames(tout, unname(as.character(tout[1,])))
tout <- tout[-1,]
row.names(tout) <- txt 

# replace NA with zero
tout[is.na(tout)] <- 0

输出如下:每行是一个文档,每列是一个主题;文档名保存在行名中,虽然没有打印出来,但可供日后使用。

tout

            0         2         7         8        10         12        13        16        18
1: 0.00000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
2: 0.00000000 0.0000000 0.0000000 0.1699586 0.0000000 0.20991530 0.1692293 0.0000000 0.0000000
3: 0.00000000 0.1747023 0.0000000 0.0000000 0.1360454 0.07507119 0.0000000 0.3341721 0.0000000
4: 0.01867091 0.0000000 0.0000000 0.2255179 0.0000000 0.53661480 0.0000000 0.0000000 0.1388561
5: 0.22144410 0.0000000 0.1776052 0.0000000 0.0000000 0.00000000 0.0000000 0.2363206 0.0000000
          20         21        23        24        27
1: 0.1315621 0.03632624 0.3040853 0.0000000 0.4560785
2: 0.2660085 0.00000000 0.0000000 0.0000000 0.0000000
3: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
4: 0.0000000 0.00000000 0.0000000 0.0000000 0.0000000
5: 0.0000000 0.00000000 0.0000000 0.1914769 0.0000000

友情链接