帖子标题字符长度分布

偶然看到 YIHUI 大神提供的这个数据,试着拟合了一下,感觉好像更符合负二项分布(NB)。






代码如下:
(借用了评论1里的画图脚本)

# Observed post-title lengths in characters (per the article title); this is
# the sample that the likelihood functions and plots below operate on.
len <- c(20,20,13,15,2,11,31,10,12,20,13,56,7,13,19,46,16,19,14,9,20,10,22,13,2,43,11,15,20,14,26,10,19,33,15,15,65,7,16,18,10,32,14,17,14,24,19,60,13,17,27,7,12,7,11,70,50,8,13,8,15,2,20,27,39,7,7,26,21,19,22,8,26,42,8,17,37,17,5,14,21,8,28,18,69,12,23,12,17,14,17,8,20,31,36,25,20,6,6,11)

# Quick look at the sample: location, spread, and a coarse histogram
# (about 20 bins requested).
mean(len)
sd(len)
hist(len, breaks = 20)


# Negative log-likelihood of a Poisson model, in the form nlm() minimises.
#
# Args:
#   p: numeric vector whose first element is the Poisson mean (lambda).
#   x: vector of observed counts. Defaults to the global `len`, so existing
#      calls such as nlm(ll.pois, 2) keep working unchanged.
#
# Returns:
#   The negated sum of the Poisson log-probabilities of `x`.
ll.pois <- function(p, x = len) {
    mu <- p[1]
    # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
    -sum(dpois(x, lambda = mu, log = TRUE))
}

# Negative log-likelihood of a normal model, in the form nlm() minimises.
#
# Args:
#   p: numeric vector; p[1] is the mean and p[2] is the standard deviation
#      (it is passed to dnorm() as `sd`, not as a variance).
#   x: vector of observations. Defaults to the global `len`, so existing
#      calls such as nlm(ll.norm, c(2, 1)) keep working unchanged.
#
# Returns:
#   The negated sum of the normal log-densities of `x`.
ll.norm <- function(p, x = len) {
    mu <- p[1]
    sigma <- p[2]
    # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
    -sum(dnorm(x, mean = mu, sd = sigma, log = TRUE))
}

# Negative log-likelihood of a negative-binomial model, in the form nlm()
# minimises. Uses dnbinom()'s mean/size parameterisation.
#
# Args:
#   p: numeric vector; p[1] is the mean (`mu`) and p[2] is the size
#      (dispersion) parameter passed to dnbinom() as `size`.
#   x: vector of observed counts. Defaults to the global `len`, so existing
#      calls such as nlm(ll.nb, c(2, 1)) keep working unchanged.
#
# Returns:
#   The negated sum of the negative-binomial log-probabilities of `x`.
ll.nb <- function(p, x = len) {
    mu <- p[1]
    theta <- p[2]
    # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
    -sum(dnbinom(x, mu = mu, size = theta, log = TRUE))
}

# Fit each model by minimising its negative log-likelihood with nlm().
# Starting values are taken from the sample moments instead of arbitrary
# constants, so the optimiser starts inside the valid parameter region and
# near the optimum rather than far from the data.
out.pois <- nlm(ll.pois, mean(len))
out.norm <- nlm(ll.norm, c(mean(len), sd(len)))
out.nb <- local({
    m <- mean(len)
    v <- var(len)
    # Method-of-moments size (var = mu + mu^2 / size); it is only defined when
    # the sample is over-dispersed, otherwise fall back to 1.
    theta0 <- if (v > m) m^2 / (v - m) else 1
    nlm(ll.nb, c(m, theta0))
})


# Density-scaled histogram of the sample with its kernel density estimate.
hist(len, breaks = 50, freq = FALSE, col = "blue", border = "blue")
lines(density(len), col = "blue")

# Grid of lengths at which the fitted curves are evaluated.
x <- seq(from = 1, to = 100, by = 1)

# Overlay the three fitted models using the nlm() estimates.
lines(
  x,
  dnorm(x, mean = out.norm$estimate[1], sd = out.norm$estimate[2]),
  col = "red"
)
lines(x, dpois(x, lambda = out.pois$estimate[1]), col = "black")
lines(
  x,
  dnbinom(x, mu = out.nb$estimate[1], size = out.nb$estimate[2]),
  col = "green"
)

# Text-only legend; each label is coloured like the curve it names.
legend(
  x = 50, y = 0.07,
  legend = c(" 数据密度", "正态分布", "泊松分布", "负二项分布"),
  text.col = c("blue", "red", "black", "green")
)



评论

此博客中的热门博文

R包编写详细教程

Hadley Wickham的R语言编写规范

RMarkdown中文报错的问题【解决】