Troubleshooting the Box-Cox transformation in R (need to use a for loop or apply)

杀马特。学长 韩版系。学妹 提交于 2021-02-17 05:31:09

问题


Please find my data below (rows are labelled by disease group: 0 = control, 1 = ulcerative colitis, 2 = Crohn's disease; columns are gene expression values).

     structure(c(5.54312e-05, 5.6112e-06, 9.74312e-05, 1.3612e-06, 
     1.29312e-05, 7.2512e-06, 0.0002159302, 3.6312e-06, 0.0001467552, 
     1.53312e-05, 0.0009132182, 1.9312e-06, 0.0074214952, 0.0006480372, 
     5.1312e-06, 6.1812e-06, 4.7612e-06, 0.0001199302, 0.0008845182, 
     0.0008506632, 0.0002366382, 7.3912e-06, 8.5112e-06, 2.63312e-05, 
     0.0013685242, 1.12312e-05, 0.0001775992, 0.0063385632, 0.0061628972, 
    0.0406951632, 0.0132550862, 0.0330866502, 0.0741588422, 0.0049675282, 
    0.0124742612, 0.0432014482, 0.0114703162, 0.0384477822, 0.0188251552, 
    0.0277018382, 0.0633737932, 0.0053745442, 0.0488762832, 0.0099598792, 
    0.0044341092, 0.0041768872, 0.0152889442, 0.0602260842, 0.0512892512, 
    0.0065575852, 0.0174603572, 0.0076848152, 0.0021076082, 0.0057732232, 
    0.0761864242, 0.0376310742, 0.0521594242, 0.0121793962, 0.0471997972, 
    0.0224588692, 0.0302616442, 0.0062663212, 0.0286649272, 0.0228584812, 
    0.0280185812, 0.0176817072, 0.0405636232, 0.0297912062, 0.0347780872, 
    0.0193185042, 0.0118479432, 0.0096142082, 0.0640275732, 0.0353341802, 
    0.0416389862, 0.0560150452, 0.0330486812, 0.0176602362, 0.0301871972, 
    0.0579195622, 0.0299905202, 0.0001129152, 0.0009209172, 0.0010817792, 
    0.0001951902, 0.0016784762, 0.0001716432, 0.0001917332, 0.0005600662, 
    0.0003840872, 0.0004548142, 0.0007234162, 0.0002039282, 0.0009733682, 
    0.0008222022, 0.0006205572, 0.0002608002, 0.0002146382, 0.0020774742, 
    0.0006584612, 0.0004037032, 0.0003786822, 0.0004093372, 0.0017226182, 
    0.0002138162, 0.0001766742, 0.0020229092, 0.0018869602, 0.0530292672, 
    0.0225949962, 0.0119676672, 0.0268511442, 0.0377380112, 0.0313562992, 
    0.1041032912, 0.0632652472, 0.0180284852, 0.1160380322, 0.0057282012, 
    0.0536359992, 0.0591269722, 0.0118352722, 0.0046396552, 0.0143029422, 
    0.0829488842, 0.0152022692, 0.0212954622, 0.0433420312, 0.0081537062, 
    0.0156137782, 0.0432896402, 0.0488343522, 0.0191447942, 0.0598099022, 
    0.0069907162, 0.0408296912, 0.0298613812, 0.0614052022, 0.0061426502, 
    0.0097676332, 0.0354280242, 0.0372933212, 0.0130974212, 0.0022172112, 
    0.0402114242, 0.0063038722, 0.0301466432, 0.0320339102, 0.0245904292, 
    0.0779917522, 0.0172156972, 0.0147311782, 0.0480258512, 0.0316871712, 
    0.0324477412, 0.0322786442, 0.0173019162, 0.0134506982, 0.0402077862, 
    0.0426696462, 0.0345675212, 0.0346313502, 1.93312e-05, 4.7512e-06, 
    5.41312e-05, 3.12e-08, 1.91312e-05, 9.642e-07, 6.0112e-06, 3.12e-08, 
    1.0812e-06, 4.412e-07, 7.72312e-05, 4.382e-07, 0.0005851852, 
    0.0002470232, 7.8912e-06, 3.1612e-06, 2.1712e-06, 7.5912e-06, 
    9.29312e-05, 0.0001160552, 5.51312e-05, 7.8212e-06, 6.6812e-06, 
    2.0912e-06, 0.0001043732, 4.1912e-06, 1.27312e-05, 0.0001975332, 
    0.0001513812, 0.0001073372, 6.54312e-05, 0.0002255952, 0.0001426622, 
    0.0001689042, 3.50312e-05, 0.0003652732, 0.0001742852, 0.0003393582, 
    8.70312e-05, 0.0001367102, 0.0001566652, 0.0002242122, 0.0002053362, 
    8.87312e-05, 0.0003058052, 0.0001336462, 0.0001512112, 0.0001072602, 
    0.0001626102, 0.0001522802, 6.88312e-05, 0.0001138952, 0.0002492892, 
    0.0002425912, 0.0007929912, 0.0076409822, 0.0049373582, 0.0004223922, 
    0.0009535442, 0.0009512182, 0.0006713372, 0.0011064372, 0.0026065992, 
    0.0030068982, 0.0019116772, 0.0013541412, 0.0124617692, 0.0004349482, 
    0.0023764912, 0.0078575922, 0.0004369202, 0.0004881912, 0.0003481772, 
    0.0009314802, 0.0003240052, 0.0049453522, 0.0006938762, 0.0004796032, 
    0.0008434462, 0.0014197062, 0.0015475092, 8.16312e-05, 6.63312e-05, 
    0.0001016142, 3.08312e-05, 0.0001470702, 5.13312e-05, 0.0001095102, 
    2.39312e-05, 0.0002255062, 4.28312e-05, 0.0002308162, 2.10312e-05, 
    0.0001356312, 0.0001242042, 0.0002451592, 0.0002754772, 3.18312e-05, 
    0.0001751912, 0.0001802232, 0.0002467002, 0.0003787392, 4.35312e-05, 
    0.0002678552, 7.20312e-05, 7.65312e-05, 8.79312e-05, 0.0001300572, 
    0.0001114932, 3.17312e-05, 0.0002001272, 3.1512e-06, 8.75312e-05, 
    3.1412e-06, 6.9212e-06, 0.0001659672, 5.98312e-05, 0.0002013862, 
    5.9512e-06, 2.57312e-05, 2.53312e-05, 3.27312e-05, 0.0001374772, 
    0.0001344332, 6.172e-07, 3.90312e-05, 0.0188869402, 0.0503434972, 
    4.15312e-05, 1.67312e-05, 0.0001726452, 4.95312e-05, 1.27312e-05, 
    9.85312e-05, 4.28312e-05, 0.0027084332, 0.0032156172, 0.0045711912, 
    0.0017135802, 0.0243532152, 0.0066607792, 0.0031989182, 0.0030944172, 
    0.0047891942, 0.0028169862, 0.0215873442, 0.0020847562, 0.0037806512, 
    0.0217515262, 0.0090971742, 0.0122162562, 0.0011257962, 0.0130435652, 
    0.0055148042, 0.0083239932, 0.0268987952, 0.0021491662, 0.0080216542, 
    0.0066735982, 0.0053911702, 0.0185785902, 0.0137863282, 0.0008059812, 
    0.0012895362, 0.0024514472, 0.0002341382, 0.0016947642, 0.0002882062, 
    0.0023575092, 0.0008561602, 0.0015975512, 0.0001175692, 0.0001666122, 
    0.0003673192, 0.0010398722, 0.0017795592, 0.0004381232, 0.0010125462, 
    0.0005299672, 0.0031931172, 0.0025627332, 0.0027740412, 0.0030131672, 
    0.0013492282, 0.0016463272, 0.0011142532, 0.0012079132, 0.0028049802, 
    0.0003664502), .Dim = c(27L, 13L), .Dimnames = list(c("2", "0", 
   "0", "0", "1", "0", "0", "1", "1", "1", "2", "0", "0", "1", "2", 
   "2", "1", "2", "2", "2", "2", "1", "1", "2", "2", "0", "0"), 
   c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6", "Gene7", 
   "Gene8", "Gene9", "Gene10", "Gene11", "Gene12", "Gene13")))

I used the bestNormalize package to apply the Box-Cox transformation to individual columns (extracting each one as a vector), e.g.

       values <- boxcox(data[, 1], standardize=T)
       normvalues<- predict(values)

However, when I inspect the values in column 1 after applying the Box-Cox transformation to the whole matrix using

        process <-boxcox(data, standardize=T)
        norm <- predict(process)

the values are not the same. Whilst neither achieves normality (judging by histograms and ad.test), how do I know which is the correct one? I understand from the bestNormalize help page that, in boxcox(x, ...), x needs to be a vector.

If I need to apply the Box-Cox transformation to each individual column vector (columns 1:13) rather than to the whole matrix, I am lost as to how to write a for loop or use the apply function for this.

Any suggestions would be appreciated. My current, non-functioning attempts at a loop are below:

    for(i in 1:ncol(data)){
    normvalues <- apply(data[,i], 
    Margin=2, 
    FUN=function()
    {process <- boxcox(data[, i], standardize=T)
    normout <- predict(process[i])
    print(normout)}
    }

Or using pipes?

   for(i in 1:ncol(stacknew)){
   normalcheck5 <- stacknew[,i]
   %>% boxcox()
   %>%  predict()
   print(normalcheck5)
   }

I need the output in a 27 x 13 matrix containing the values of the transformed gene expression where the transformation has been applied to each column vector individually. Any suggestions would be helpful please.


回答1:


I would solve your problem as follows.

# packages
library(bestNormalize)
library(tidyr)
library(ggplot2)
library(dplyr)
library(stringr)

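# data: 27 x 13 numeric matrix (rows = samples, named by disease-group code;
# columns = Gene1 ... Gene13)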
my_data <- structure(c(
  5.54312e-05, 5.6112e-06, 9.74312e-05, 1.3612e-06,
  1.29312e-05, 7.2512e-06, 0.0002159302, 3.6312e-06, 0.0001467552,
  1.53312e-05, 0.0009132182, 1.9312e-06, 0.0074214952, 0.0006480372,
  5.1312e-06, 6.1812e-06, 4.7612e-06, 0.0001199302, 0.0008845182,
  0.0008506632, 0.0002366382, 7.3912e-06, 8.5112e-06, 2.63312e-05,
  0.0013685242, 1.12312e-05, 0.0001775992, 0.0063385632, 0.0061628972,
  0.0406951632, 0.0132550862, 0.0330866502, 0.0741588422, 0.0049675282,
  0.0124742612, 0.0432014482, 0.0114703162, 0.0384477822, 0.0188251552,
  0.0277018382, 0.0633737932, 0.0053745442, 0.0488762832, 0.0099598792,
  0.0044341092, 0.0041768872, 0.0152889442, 0.0602260842, 0.0512892512,
  0.0065575852, 0.0174603572, 0.0076848152, 0.0021076082, 0.0057732232,
  0.0761864242, 0.0376310742, 0.0521594242, 0.0121793962, 0.0471997972,
  0.0224588692, 0.0302616442, 0.0062663212, 0.0286649272, 0.0228584812,
  0.0280185812, 0.0176817072, 0.0405636232, 0.0297912062, 0.0347780872,
  0.0193185042, 0.0118479432, 0.0096142082, 0.0640275732, 0.0353341802,
  0.0416389862, 0.0560150452, 0.0330486812, 0.0176602362, 0.0301871972,
  0.0579195622, 0.0299905202, 0.0001129152, 0.0009209172, 0.0010817792,
  0.0001951902, 0.0016784762, 0.0001716432, 0.0001917332, 0.0005600662,
  0.0003840872, 0.0004548142, 0.0007234162, 0.0002039282, 0.0009733682,
  0.0008222022, 0.0006205572, 0.0002608002, 0.0002146382, 0.0020774742,
  0.0006584612, 0.0004037032, 0.0003786822, 0.0004093372, 0.0017226182,
  0.0002138162, 0.0001766742, 0.0020229092, 0.0018869602, 0.0530292672,
  0.0225949962, 0.0119676672, 0.0268511442, 0.0377380112, 0.0313562992,
  0.1041032912, 0.0632652472, 0.0180284852, 0.1160380322, 0.0057282012,
  0.0536359992, 0.0591269722, 0.0118352722, 0.0046396552, 0.0143029422,
  0.0829488842, 0.0152022692, 0.0212954622, 0.0433420312, 0.0081537062,
  0.0156137782, 0.0432896402, 0.0488343522, 0.0191447942, 0.0598099022,
  0.0069907162, 0.0408296912, 0.0298613812, 0.0614052022, 0.0061426502,
  0.0097676332, 0.0354280242, 0.0372933212, 0.0130974212, 0.0022172112,
  0.0402114242, 0.0063038722, 0.0301466432, 0.0320339102, 0.0245904292,
  0.0779917522, 0.0172156972, 0.0147311782, 0.0480258512, 0.0316871712,
  0.0324477412, 0.0322786442, 0.0173019162, 0.0134506982, 0.0402077862,
  0.0426696462, 0.0345675212, 0.0346313502, 1.93312e-05, 4.7512e-06,
  5.41312e-05, 3.12e-08, 1.91312e-05, 9.642e-07, 6.0112e-06, 3.12e-08,
  1.0812e-06, 4.412e-07, 7.72312e-05, 4.382e-07, 0.0005851852,
  0.0002470232, 7.8912e-06, 3.1612e-06, 2.1712e-06, 7.5912e-06,
  9.29312e-05, 0.0001160552, 5.51312e-05, 7.8212e-06, 6.6812e-06,
  2.0912e-06, 0.0001043732, 4.1912e-06, 1.27312e-05, 0.0001975332,
  0.0001513812, 0.0001073372, 6.54312e-05, 0.0002255952, 0.0001426622,
  0.0001689042, 3.50312e-05, 0.0003652732, 0.0001742852, 0.0003393582,
  8.70312e-05, 0.0001367102, 0.0001566652, 0.0002242122, 0.0002053362,
  8.87312e-05, 0.0003058052, 0.0001336462, 0.0001512112, 0.0001072602,
  0.0001626102, 0.0001522802, 6.88312e-05, 0.0001138952, 0.0002492892,
  0.0002425912, 0.0007929912, 0.0076409822, 0.0049373582, 0.0004223922,
  0.0009535442, 0.0009512182, 0.0006713372, 0.0011064372, 0.0026065992,
  0.0030068982, 0.0019116772, 0.0013541412, 0.0124617692, 0.0004349482,
  0.0023764912, 0.0078575922, 0.0004369202, 0.0004881912, 0.0003481772,
  0.0009314802, 0.0003240052, 0.0049453522, 0.0006938762, 0.0004796032,
  0.0008434462, 0.0014197062, 0.0015475092, 8.16312e-05, 6.63312e-05,
  0.0001016142, 3.08312e-05, 0.0001470702, 5.13312e-05, 0.0001095102,
  2.39312e-05, 0.0002255062, 4.28312e-05, 0.0002308162, 2.10312e-05,
  0.0001356312, 0.0001242042, 0.0002451592, 0.0002754772, 3.18312e-05,
  0.0001751912, 0.0001802232, 0.0002467002, 0.0003787392, 4.35312e-05,
  0.0002678552, 7.20312e-05, 7.65312e-05, 8.79312e-05, 0.0001300572,
  0.0001114932, 3.17312e-05, 0.0002001272, 3.1512e-06, 8.75312e-05,
  3.1412e-06, 6.9212e-06, 0.0001659672, 5.98312e-05, 0.0002013862,
  5.9512e-06, 2.57312e-05, 2.53312e-05, 3.27312e-05, 0.0001374772,
  0.0001344332, 6.172e-07, 3.90312e-05, 0.0188869402, 0.0503434972,
  4.15312e-05, 1.67312e-05, 0.0001726452, 4.95312e-05, 1.27312e-05,
  9.85312e-05, 4.28312e-05, 0.0027084332, 0.0032156172, 0.0045711912,
  0.0017135802, 0.0243532152, 0.0066607792, 0.0031989182, 0.0030944172,
  0.0047891942, 0.0028169862, 0.0215873442, 0.0020847562, 0.0037806512,
  0.0217515262, 0.0090971742, 0.0122162562, 0.0011257962, 0.0130435652,
  0.0055148042, 0.0083239932, 0.0268987952, 0.0021491662, 0.0080216542,
  0.0066735982, 0.0053911702, 0.0185785902, 0.0137863282, 0.0008059812,
  0.0012895362, 0.0024514472, 0.0002341382, 0.0016947642, 0.0002882062,
  0.0023575092, 0.0008561602, 0.0015975512, 0.0001175692, 0.0001666122,
  0.0003673192, 0.0010398722, 0.0017795592, 0.0004381232, 0.0010125462,
  0.0005299672, 0.0031931172, 0.0025627332, 0.0027740412, 0.0030131672,
  0.0013492282, 0.0016463272, 0.0011142532, 0.0012079132, 0.0028049802,
  0.0003664502
), .Dim = c(27L, 13L), .Dimnames = list(
  c(
    "2", "0",
    "0", "0", "1", "0", "0", "1", "1", "1", "2", "0", "0", "1", "2",
    "2", "1", "2", "2", "2", "2", "1", "1", "2", "2", "0", "0"
  ),
  c(
    "Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6", "Gene7",
    "Gene8", "Gene9", "Gene10", "Gene11", "Gene12", "Gene13"
  )
))

# `structure()` above returned a matrix, as the class shows:
class(my_data)
#> [1] "matrix"
# (captured on R < 4.0; from R 4.0 onwards this prints "matrix" "array")

# lapply() iterates over the columns of a data.frame but over the individual
# cells of a matrix, so switch to a data.frame before transforming.
my_data <- as.data.frame(my_data)

# Fit a separate Box-Cox transformation to every column (gene) and keep only
# the transformed values, which the fitted bestNormalize object holds in `x.t`.
my_data_boxcox <- lapply(my_data, function(gene_values) {
  boxcox(gene_values)$x.t
})

# Bind the transformed columns side by side (the list names become the column
# names) and store the outcome as a data.frame.
result <- as.data.frame(do.call(cbind, my_data_boxcox))

The code above is the important part; the following is just a graphical check.

# Graphical check: one histogram per gene, before and after the transformation.
# Both figures follow the same recipe, so it is written once as a helper.
#
# wide_data: data.frame with one column per gene.
# Returns the ggplot object (auto-printed when called at top level).
plot_gene_histograms <- function(wide_data) {
  # one row per (gene, value) pair; pivot_longer() names the columns
  # `name` and `value` by default
  long_data <- pivot_longer(wide_data, everything())

  # natural ordering of the facet labels, so Gene2 comes before Gene10
  gene_levels <- str_sort(unique(long_data$name), numeric = TRUE)
  long_data <- mutate(long_data, name = factor(name, levels = gene_levels))

  # scales = "free": every facet gets its own x and y axis range
  ggplot(long_data) +
    geom_histogram(aes(x = value), bins = 10) +
    facet_wrap(vars(name), scales = "free")
}

# plot original data
plot_gene_histograms(my_data)

# plot after boxcox trans
plot_gene_histograms(result)

Created on 2020-03-08 by the reprex package (v0.3.0)



来源:https://stackoverflow.com/questions/60591514/trouble-shooting-box-cox-transformation-in-r-need-to-use-for-loop-or-apply

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!