问题
I have a dataframe, and I want to produce a table of summary statistics including number of valid numeric values, mean and sd by group for each of three columns. I can't seem to find any function to count the number of numeric values in R. I can use length() which tells me how many values there are, and I can use colSums(is.na(x)) to count the number of NA values, but colSums(is.numeric(x)) doesn't work the same way.
I could use tapply with { length - number of NA values - number of blank values - number of text values } but surely there's an easier way.
My data (I want to group by Nominal, and produce summary stats on Actual, LinPred and QuadPred):
structure(list(Nominal = c(1, 3, 6, 10, 30, 50, 150, 250, 1,
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1,
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1,
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1,
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250), Actual = c(NA,
0.422, 0.782, 1.25, 3.85, 6.94, 18.8, 31.2, 0.118, 0.361, 0.747,
1.18, 3.58, 5.82, 16.7, 29, 0.113, 0.382, 0.692, 1.12, 3.51,
5.43, 17.1, 28.7, 0.134, 0.402, 0.718, 1.25, 3.65, 6.52, NA,
28.8, 0.123, 0.396, 0.664, 1.12, 3.83, 5.6, NA, 28.1, 0.112,
0.341, 0.7, 1.08, 3.25, 5.97, NA, 27.1, 0.106, 0.35, 0.674, 1.14,
3.28, 5.5, 17.3, 30, 0.122, 0.321, 0.673, 1.22, 3.41, 5.85, 17.6,
28.1, 0.129, 0.351, 0.737, 1.06, 3.39, 5.53, 15.9, 28.5), LinPred = c(NA,
3.49519490135683, 6.4706724568458, 10.3387932789814, 31.8283534019573,
57.3678690865708, 155.393324109068, 257.881995464799, 0.982569410055046,
2.99101676001009, 6.18138991672881, 9.76022819874748, 29.5967452353405,
48.1108278028274, 138.036371702049, 239.698521514589, 0.941243332895477,
3.16458628408028, 5.72680306797355, 9.26431527283265, 29.0181801551066,
44.887393784381, 141.342457874815, 237.218956885015, 1.07941778099747,
3.36900393602722, 6.0686652233011, 10.6136646056736, 31.1174212178803,
55.6364968333108, NA, 245.979704049963, 0.98544222985819, 3.3177445444967,
5.60733069952645, 9.50304445584572, 32.6552029637958, 47.7767234652982,
NA, 239.999441704736, 0.89146667871891, 2.8478667888003, 5.91488704870955,
9.1613151789756, 27.7001284491792, 50.9377192763467, NA, 231.456209782983,
0.887738051402174, 3.04188235451485, 5.9023034783202, 10.0163659588551,
28.9092709123842, 48.5084526866061, 152.684283738776, 264.805729023739,
1.02899341554071, 2.78585700701375, 5.89347501806154, 10.7226427795477,
30.0569707460098, 51.5984137771366, 155.332821816374, 248.031654532288,
1.09079263735132, 3.05071081477351, 6.45849647461568, 9.31008913816238,
29.8804015408367, 48.7733064943658, 140.324439376654, 251.563038635751
), QuadPred = c(NA, 3.46077095737974, 6.38659713413108, 10.1956079501556,
31.4700369979564, 57.0089799611706, 157.775316006369, 268.303966059862,
0.99289436409299, 2.96536517477853, 6.10198249392715, 9.62549220297933,
29.2517496204359, 47.7196128593832, 139.600469198163, 248.272682787657,
0.95232583127381, 3.13590297331348, 5.65480031033985, 9.13693141349813,
28.6769820181676, 44.4936547741659, 143.050878627236, 245.555818447238,
1.08417831830729, 3.33895371044810, 6.00044125019758, 10.4882228621509,
30.8451526869812, 55.4331759085967, NA, 256.446833964951, 0.991679220421247,
3.28844923081897, 5.54540949253351, 9.3907657095483, 32.3793538902883,
47.5218142460371, NA, 249.828516445647, 0.899183876120787, 2.82554368740693,
5.84875388286628, 9.05319326862309, 27.4395572248486, 50.7001828907023,
NA, 240.411024762687, 0.884412915928806, 3.05257006009469, 5.93046554432476,
10.0673979669, 29.0311859234644, 48.645035648271, 151.914544909710,
261.273991566153, 1.02660962824666, 2.79491765184684, 5.92158513760114,
10.7773327827008, 30.1813919027873, 51.7318741314584, 154.518856412401,
245.027488125567, 1.08881969774848, 3.06145444119556, 6.48990638077339,
9.35738460692028, 30.0044505131336, 48.9096796323938, 139.747394069421,
248.451100154569)), .Names = c("Nominal", "Actual", "LinPred",
"QuadPred"), row.names = c(NA, -72L), class = "data.frame")
回答1:
These are a few add-on packages that might help (see Quick-R)
Using the Hmisc package
library(Hmisc)
describe(mydata)
# n, nmiss, unique, mean, 5,10,25,50,75,90,95th percentiles
# 5 lowest and 5 highest scores
Using the pastecs package
library(pastecs)
stat.desc(mydata)
# nbr.val, nbr.null, nbr.na, min max, range, sum,
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var
Using the psych package
library(psych)
describe(mydata)
# item name ,item number, nvalid, mean, sd,
# median, mad, min, max, skew, kurtosis, se
I'd use describe.by from the psych package (renamed describeBy in current versions of psych, where describe.by is deprecated):
> describe.by(biastable, as.factor(Nominal))
group: 1
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN NaN 0.00
Actual 2 8 0.12 0.01 0.12 0.12 0.01 0.11 0.13 0.03 0.09 -1.47 0.00
LinPred 3 8 0.99 0.08 0.98 0.99 0.10 0.89 1.09 0.20 0.04 -1.70 0.03
QuadPred 4 8 0.99 0.08 0.99 0.99 0.10 0.88 1.09 0.20 -0.04 -1.64 0.03
------------------------------------------------------------------------
group: 3
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 3.00 0.00 3.00 3.00 0.00 3.00 3.00 0.00 NaN NaN 0.00
Actual 2 9 0.37 0.03 0.36 0.37 0.03 0.32 0.42 0.10 0.15 -1.50 0.01
LinPred 3 9 3.12 0.24 3.05 3.12 0.30 2.79 3.50 0.71 0.15 -1.52 0.08
QuadPred 4 9 3.10 0.23 3.06 3.10 0.34 2.79 3.46 0.67 0.12 -1.51 0.08
------------------------------------------------------------------------
group: 6
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 6.00 0.00 6.00 6.00 0.00 6.00 6.00 0.00 NaN NaN 0.00
Actual 2 9 0.71 0.04 0.70 0.71 0.04 0.66 0.78 0.12 0.46 -1.30 0.01
LinPred 3 9 6.02 0.30 5.91 6.02 0.28 5.61 6.47 0.86 0.28 -1.43 0.10
QuadPred 4 9 5.99 0.31 5.93 5.99 0.25 5.55 6.49 0.94 0.26 -1.26 0.10
------------------------------------------------------------------------
group: 10
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 10.00 0.00 10.00 10.00 0.00 10.00 10.00 0.00 NaN NaN 0.00
Actual 2 9 1.16 0.07 1.14 1.16 0.09 1.06 1.25 0.19 0.09 -1.71 0.02
LinPred 3 9 9.85 0.60 9.76 9.85 0.74 9.16 10.72 1.56 0.24 -1.76 0.20
QuadPred 4 9 9.79 0.62 9.63 9.79 0.72 9.05 10.78 1.72 0.27 -1.65 0.21
------------------------------------------------------------------------
group: 30
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 30.00 0.00 30.00 30.00 0.00 30.00 30.00 0.00 NaN NaN 0.00
Actual 2 9 3.53 0.22 3.51 3.53 0.21 3.25 3.85 0.60 0.23 -1.58 0.07
LinPred 3 9 30.08 1.55 29.88 30.08 1.44 27.70 32.66 4.96 0.21 -1.27 0.52
QuadPred 4 9 29.92 1.51 30.00 29.92 1.44 27.44 32.38 4.94 0.04 -1.22 0.50
------------------------------------------------------------------------
group: 50
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 50.00 0.00 50.00 50.00 0.00 50.00 50.00 0.00 NaN NaN 0.00
Actual 2 9 5.91 0.51 5.82 5.91 0.43 5.43 6.94 1.51 0.90 -0.73 0.17
LinPred 3 9 50.40 3.98 48.77 50.40 3.21 44.89 57.37 12.48 0.49 -1.16 1.33
QuadPred 4 9 50.24 3.97 48.91 50.24 2.65 44.49 57.01 12.52 0.39 -1.21 1.32
------------------------------------------------------------------------
group: 150
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 150.00 0.00 150.00 150.00 0.00 150.00 150.00 0.00 NaN NaN 0.00
Actual 2 6 17.23 0.97 17.20 17.23 0.67 15.90 18.80 2.90 0.25 -1.23 0.39
LinPred 3 6 147.19 8.11 147.01 147.19 11.13 138.04 155.39 17.36 -0.01 -2.22 3.31
QuadPred 4 6 147.77 7.95 147.48 147.77 10.95 139.60 157.78 18.17 0.07 -2.10 3.25
------------------------------------------------------------------------
group: 250
var n mean sd median trimmed mad min max range skew kurtosis se
Nominal 1 9 250.00 0.00 250.00 250.00 0.00 250.00 250.00 0.00 NaN NaN 0.00
Actual 2 9 28.83 1.18 28.70 28.83 0.89 27.10 31.20 4.10 0.59 -0.57 0.39
LinPred 3 9 246.29 10.57 245.98 246.29 9.31 231.46 264.81 33.35 0.33 -1.26 3.52
QuadPred 4 9 251.51 8.84 248.45 251.51 5.08 240.41 268.30 27.89 0.62 -1.04 2.95
>
回答2:
colSums(!is.na(x)) should work.
回答3:
Can you use something like this?
length(unique(x))
回答4:
What are "blank values" and "text values"? If you have a numeric vector, then it can contain NAs (is.na()), Infs (is.infinite()), NaNs (is.nan()) and "valid" numeric values.
For "valid" numeric values (in the sense above) you could use is.finite():
is.finite(c(1,NA,Inf,NaN))
# [1] TRUE FALSE FALSE FALSE
sum( is.finite(c(1,NA,Inf,NaN)) )
# [1] 1
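So colSums(is.numeric(x)) could be done like colSums(is.finite(x)) when x is a numeric matrix. If x is a data frame, convert it first, because is.finite() has no method for data frames: colSums(is.finite(as.matrix(x))).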
回答5:
Does complete.cases(x) (or sum(complete.cases(x)), which counts the rows that have no missing values in any column) do what you want?
来源:https://stackoverflow.com/questions/1508889/how-to-count-number-of-numeric-values-in-a-column