帖子标题字符长度分布
偶然看到YIHUI大神提供的这个数据,试着拟合了一下,感觉好像更符合负二项分布(NB)啊。
代码如下:
(借用了评论1里的画图脚本)
# Sample of 100 title lengths analysed by the rest of the script.
len <- c(
  20, 20, 13, 15, 2, 11, 31, 10, 12, 20,
  13, 56, 7, 13, 19, 46, 16, 19, 14, 9,
  20, 10, 22, 13, 2, 43, 11, 15, 20, 14,
  26, 10, 19, 33, 15, 15, 65, 7, 16, 18,
  10, 32, 14, 17, 14, 24, 19, 60, 13, 17,
  27, 7, 12, 7, 11, 70, 50, 8, 13, 8,
  15, 2, 20, 27, 39, 7, 7, 26, 21, 19,
  22, 8, 26, 42, 8, 17, 37, 17, 5, 14,
  21, 8, 28, 18, 69, 12, 23, 12, 17, 14,
  17, 8, 20, 31, 36, 25, 20, 6, 6, 11
)

# First look at the sample: mean, standard deviation, and a histogram
# with a suggested 20 breaks.
mean(len)
sd(len)
hist(len, breaks = 20)
# Negative Poisson log-likelihood of the observations in `len`.
#
# p: numeric vector; p[1] is the Poisson mean (lambda).
# Returns the negated log-likelihood as a single number, so that passing
# this function to a minimiser gives the maximum-likelihood estimate.
# Note: `len` is not an argument; it is looked up in the enclosing
# environment when the function is called.
ll.pois <- function(p) {
  lambda <- p[1]
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  -sum(dpois(len, lambda = lambda, log = TRUE))
}
# Negative normal log-likelihood of the observations in `len`.
#
# p: numeric vector; p[1] is the mean and p[2] is the standard deviation
#    (it is passed to dnorm() as `sd`, so it is not a variance).
# Returns the negated log-likelihood as a single number, so that passing
# this function to a minimiser gives the maximum-likelihood estimate.
# Note: `len` is not an argument; it is looked up in the enclosing
# environment when the function is called.
ll.norm <- function(p) {
  mu <- p[1]
  sigma <- p[2]
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  -sum(dnorm(len, mean = mu, sd = sigma, log = TRUE))
}
# Negative log-likelihood of the observations in `len` under a negative
# binomial distribution in its mean/size parameterisation.
#
# p: numeric vector; p[1] is the mean (`mu`) and p[2] is the dispersion
#    parameter passed to dnbinom() as `size`.
# Returns the negated log-likelihood as a single number, so that passing
# this function to a minimiser gives the maximum-likelihood estimate.
# Note: `len` is not an argument; it is looked up in the enclosing
# environment when the function is called.
ll.nb <- function(p) {
  mu <- p[1]
  theta <- p[2]
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  -sum(dnbinom(len, mu = mu, size = theta, log = TRUE))
}
# Starting values for the optimiser, taken from the sample moments so that
# each search begins close to the data rather than at an arbitrary point.
start.mu <- mean(len)
start.sd <- sd(len)
# Method-of-moments size for the negative binomial: var = mu + mu^2 / size.
# Fall back to 1 when the sample is not over-dispersed.
excess.var <- var(len) - start.mu
start.theta <- if (excess.var > 0) start.mu^2 / excess.var else 1

# Maximum-likelihood fits: nlm() minimises each negative log-likelihood.
out.pois <- nlm(ll.pois, start.mu)
out.norm <- nlm(ll.norm, c(start.mu, start.sd))
out.nb <- nlm(ll.nb, c(start.mu, start.theta))

# Density-scaled histogram of the data with a kernel density estimate.
hist(len, breaks = 50, col = "blue", border = "blue",
     freq = FALSE)
lines(density(len), col = "blue")

# Overlay the three fitted distributions, evaluated at 1, 2, ..., 100.
x <- seq(1, 100, 1)
lines(x, dnorm(x, mean = out.norm$estimate[1], sd = out.norm$estimate[2]),
      type = "l", col = "red")
lines(x, dpois(x, lambda = out.pois$estimate[1]), type = "l", col = "black")
lines(x, dnbinom(x, mu = out.nb$estimate[1], size = out.nb$estimate[2]),
      type = "l", col = "green")

# Legend entries are coloured to match the curves they label.
legend(50, 0.07, c(" 数据密度", "正态分布", "泊松分布", "负二项分布"),
       text.col = c("blue", "red", "black", "green"))
代码如下:
(借用了评论1里的画图脚本)
# Sample of 100 title lengths analysed by the rest of the script.
len <- c(
  20, 20, 13, 15, 2, 11, 31, 10, 12, 20,
  13, 56, 7, 13, 19, 46, 16, 19, 14, 9,
  20, 10, 22, 13, 2, 43, 11, 15, 20, 14,
  26, 10, 19, 33, 15, 15, 65, 7, 16, 18,
  10, 32, 14, 17, 14, 24, 19, 60, 13, 17,
  27, 7, 12, 7, 11, 70, 50, 8, 13, 8,
  15, 2, 20, 27, 39, 7, 7, 26, 21, 19,
  22, 8, 26, 42, 8, 17, 37, 17, 5, 14,
  21, 8, 28, 18, 69, 12, 23, 12, 17, 14,
  17, 8, 20, 31, 36, 25, 20, 6, 6, 11
)

# First look at the sample: mean, standard deviation, and a histogram
# with a suggested 20 breaks.
mean(len)
sd(len)
hist(len, breaks = 20)
# Negative Poisson log-likelihood of the observations in `len`.
#
# p: numeric vector; p[1] is the Poisson mean (lambda).
# Returns the negated log-likelihood as a single number, so that passing
# this function to a minimiser gives the maximum-likelihood estimate.
# Note: `len` is not an argument; it is looked up in the enclosing
# environment when the function is called.
ll.pois <- function(p) {
  lambda <- p[1]
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  -sum(dpois(len, lambda = lambda, log = TRUE))
}
# Negative normal log-likelihood of the observations in `len`.
#
# p: numeric vector; p[1] is the mean and p[2] is the standard deviation
#    (it is passed to dnorm() as `sd`, so it is not a variance).
# Returns the negated log-likelihood as a single number, so that passing
# this function to a minimiser gives the maximum-likelihood estimate.
# Note: `len` is not an argument; it is looked up in the enclosing
# environment when the function is called.
ll.norm <- function(p) {
  mu <- p[1]
  sigma <- p[2]
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  -sum(dnorm(len, mean = mu, sd = sigma, log = TRUE))
}
# Negative log-likelihood of the observations in `len` under a negative
# binomial distribution in its mean/size parameterisation.
#
# p: numeric vector; p[1] is the mean (`mu`) and p[2] is the dispersion
#    parameter passed to dnbinom() as `size`.
# Returns the negated log-likelihood as a single number, so that passing
# this function to a minimiser gives the maximum-likelihood estimate.
# Note: `len` is not an argument; it is looked up in the enclosing
# environment when the function is called.
ll.nb <- function(p) {
  mu <- p[1]
  theta <- p[2]
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
  -sum(dnbinom(len, mu = mu, size = theta, log = TRUE))
}
# Starting values for the optimiser, taken from the sample moments so that
# each search begins close to the data rather than at an arbitrary point.
start.mu <- mean(len)
start.sd <- sd(len)
# Method-of-moments size for the negative binomial: var = mu + mu^2 / size.
# Fall back to 1 when the sample is not over-dispersed.
excess.var <- var(len) - start.mu
start.theta <- if (excess.var > 0) start.mu^2 / excess.var else 1

# Maximum-likelihood fits: nlm() minimises each negative log-likelihood.
out.pois <- nlm(ll.pois, start.mu)
out.norm <- nlm(ll.norm, c(start.mu, start.sd))
out.nb <- nlm(ll.nb, c(start.mu, start.theta))

# Density-scaled histogram of the data with a kernel density estimate.
hist(len, breaks = 50, col = "blue", border = "blue",
     freq = FALSE)
lines(density(len), col = "blue")

# Overlay the three fitted distributions, evaluated at 1, 2, ..., 100.
x <- seq(1, 100, 1)
lines(x, dnorm(x, mean = out.norm$estimate[1], sd = out.norm$estimate[2]),
      type = "l", col = "red")
lines(x, dpois(x, lambda = out.pois$estimate[1]), type = "l", col = "black")
lines(x, dnbinom(x, mu = out.nb$estimate[1], size = out.nb$estimate[2]),
      type = "l", col = "green")

# Legend entries are coloured to match the curves they label.
legend(50, 0.07, c(" 数据密度", "正态分布", "泊松分布", "负二项分布"),
       text.col = c("blue", "red", "black", "green"))
评论
发表评论