Splitting a large data frame into smaller segments

后端 未结 5 794
你的背包
你的背包 2020-12-01 09:17

I have the following data frame and I want to break it up into 10 different data frames. I want to break the initial 100 row data frame into 10 data frames of 10 rows. I cou

相关标签:
5条回答
  • 2020-12-01 09:57

    If you can generate a vector that defines the groups, you can split anything:

    # Build one group id per row: ids 1, 2, ... are each repeated `chunk_size`
    # times and the result is cut to nrow(df), so the last group holds the
    # remainder. Deriving everything from nrow(df) avoids hard-coding the size.
    chunk_size <- 200
    n_rows <- nrow(df)
    f <- rep(
      seq_len(ceiling(n_rows / chunk_size)),
      each = chunk_size,
      length.out = n_rows
    )
    df1 <- split(df, f = f)
    lapply(df1, dim)
    $`1`
    [1] 200   3
    
    $`2`
    [1] 200   3
    
    $`3`
    [1] 200   3
    
    $`4`
    [1] 200   3
    
    $`5`
    [1] 200   3
    
    $`6`
    [1] 123   3
    
    0 讨论(0)
  • 2020-12-01 10:00

    Something like this...?

    # Last row of each 10-row slice (10, 20, ..., 100); every slice starts
    # nine rows before its last row.
    upper <- seq(10, 100, 10)
    lower <- upper - 9
    # Pair the bounds up and extract the corresponding rows as a list of
    # data frames.
    Map(function(from, to) df[from:to, ], lower, upper)
    
    [[1]]
              one        two      three
    1  -2.4157992 -0.6232517  1.0531358
    2   0.6769020  0.3908089 -1.9543895
    3   0.9804026 -2.5167334  0.7120919
    4  -1.2200089  0.5108479  0.5599177
    5   0.4448290 -1.2885275 -0.7665413
    6   0.8431848 -0.9359947  0.1068137
    7  -1.8168134 -0.2418887  1.1176077
    8   1.4475904 -0.8010347  2.3716663
    9   0.7264027 -0.3573623 -1.1956806
    10  0.2736119 -1.5553148  0.2691115
    
    [[2]]
              one         two       three
    11 -0.3273536 -1.92475496 -0.08031696
    12  1.5558892 -1.20158371  0.09104958
    13  1.9202047 -0.13418754  0.32571632
    14 -0.0515136 -2.15669216  0.23099397
    15  0.1909732 -0.30802742 -1.28651457
    16  0.8545580 -0.18238266  1.57093844
    17  0.4903039  0.02895376 -0.47678196
    18  0.5125400  0.97052082 -0.70541908
    19 -1.9324370  0.22093545 -0.34436105
    20 -0.5763433  0.10442551 -2.05597985
    
    [[3]]
              one         two       three
    21  0.7168771 -1.22902943 -0.18728871
    22  1.2785641  0.14686576 -1.74738091
    23 -1.1856173  0.43829361  0.41269975
    24  0.0220843  1.57428924 -0.80163986
    25 -1.0012255  0.05520813  0.50871603
    26 -0.1842323 -1.61195239  0.04843504
    27  0.2328831 -0.38432225  0.95650710
    28  0.8821687 -1.32456215 -1.33367967
    29 -0.8902177  0.86414661 -1.39629358
    30 -0.6586293 -2.27325919  0.27367902
    
    [[4]]
              one        two       three
    31  1.3810437 -1.0178835  0.07779591
    32  0.6102753  0.3538498  1.92316801
    33 -1.5034439  0.7926925  2.21706284
    34  0.8251638  0.3992922  0.56781321
    35 -1.0832114  0.9878058 -0.16820827
    36 -0.4132375 -0.9214491  1.06681472
    37 -0.6787631  1.3497766  2.18327887
    38 -3.0082585 -1.3047024 -0.04913214
    39 -0.3433300  1.1008951 -2.02065141
    40  0.6009334  1.2334421  0.15623298
    
    [[5]]
              one         two       three
    41 -1.8608051 -0.08589437  0.02370983
    42 -0.1829953  0.91139017 -0.01356590
    43  1.1146731  0.42384993 -0.68717391
    44  1.9039900 -1.70218225  0.06100297
    45 -0.4851939  1.38712015 -1.30613414
    46 -0.4661664  0.23504099 -0.29335162
    47  0.5807227 -0.87821946 -0.14816121
    48 -2.0168910 -0.47657382  0.90503226
    49  2.5056404  0.27574224  0.10326333
    50  0.2238735  0.34441325 -0.17186115
    
    [[6]]
               one        two      three
    51  1.51613140 -2.5630782 -0.6720399
    52  0.03859537 -2.6688365  0.3395574
    53 -0.08695292 -0.5114117 -0.1378789
    54 -0.51878363 -0.5401962  0.3946324
    55 -2.20482710  0.1716744  0.1786546
    56 -0.28133749 -0.4497112  0.5936497
    57 -2.38269088 -0.4625695  1.0048914
    58  0.37865952  0.5055141  0.3337986
    59  0.09329172  0.1560469  0.2835735
    60 -1.10818863 -0.2618910  0.3650042
    
    [[7]]
              one        two       three
    61 -1.2507208 -1.5050083 -0.63871084
    62  0.1379394  0.7996674 -1.80196762
    63  0.1582008 -0.3208973  0.40863693
    64 -0.6224605  0.1416938 -0.47174711
    65  1.1556149 -1.4083576 -1.12619693
    66 -0.6956604  0.7994991  1.16073748
    67  0.6576676  1.4391007  0.04134445
    68  1.4610598 -1.0066840 -1.82981058
    69  1.1951788 -0.4005535  1.57256648
    70 -0.1994519  0.2711574 -1.04364396
    
    [[8]]
               one        two       three
    71  1.23897065  0.4473611 -0.35452535
    72  0.89015916  2.3747385  0.87840852
    73 -1.17339703  0.7433220  0.40232381
    74 -0.24568490 -0.4776862  1.24082294
    75 -0.47187443 -0.3271824  0.38542703
    76 -2.20899136 -1.1131712 -0.33663075
    77 -0.05968035 -0.6023045 -0.23747388
    78  1.19687199 -1.3390960 -1.37884241
    79 -1.29310506  0.3554548 -0.05936756
    80 -0.17470891  1.6198307  0.69170207
    
    [[9]]
               one         two       three
    81 -1.06792315  0.04801998  0.08166394
    82  0.84152560 -0.45793907  0.27867619
    83  0.07619456 -1.21633682 -2.51290495
    84  0.55895466 -1.01844178 -0.41887672
    85  0.33825508 -1.15061381  0.66206732
    86 -0.36041720  0.32808609 -1.83390913
    87 -0.31595401 -0.87081019  0.45369366
    88  0.92331087  1.22055348 -1.91048757
    89  1.30491142  1.22582353 -1.32244004
    90 -0.32906839  1.76467263  1.84479228
    
    [[10]]
                one        two       three
    91   2.80656707 -0.9708417  0.25467304
    92   0.35770119 -0.6132523 -1.11467041
    93   0.09598908 -0.5710063 -0.96412216
    94  -1.08728715  0.3019572 -0.04422049
    95   0.14317455  0.1452287 -0.46133199
    96  -1.00218917 -0.1360570  0.88864256
    97  -0.25316855  0.6341925 -1.37571664
    98   0.36375921  1.2244921  0.12718650
    99   0.13345555  0.5330221 -0.29444683
    100  2.28548261 -2.0413222 -0.53209956
    
    0 讨论(0)
  • 2020-12-01 10:02
     # Group id = (numeric row name - 1) %/% 200, so row names 1-200 fall in
     # group 0, 201-400 in group 1, and so on. This relies on the row names
     # being the default numbers.
     > str(split(df, (as.numeric(rownames(df))-1) %/% 200))
    List of 6
     $ 0:'data.frame':  200 obs. of  3 variables:
      ..$ one  : num [1:200] -1.592 1.664 -1.231 0.269 0.912 ...
      ..$ two  : num [1:200] 0.639 -0.525 0.642 1.347 1.142 ...
      ..$ three: num [1:200] -0.45 -0.877 0.588 1.188 -1.977 ...
     $ 1:'data.frame':  200 obs. of  3 variables:
      ..$ one  : num [1:200] -0.0017 1.9534 0.0155 -0.7732 -1.1752 ...
      ..$ two  : num [1:200] -0.422 0.869 0.45 -0.111 0.073 ...
      ..$ three: num [1:200] -0.2809 1.31908 0.26695 0.00594 -0.25583 ...
     $ 2:'data.frame':  200 obs. of  3 variables:
      ..$ one  : num [1:200] -1.578 0.433 0.277 1.297 0.838 ...
      ..$ two  : num [1:200] 0.913 0.378 0.35 -0.241 0.783 ...
      ..$ three: num [1:200] -0.8402 -0.2708 -0.0124 -0.4537 0.4651 ...
     $ 3:'data.frame':  200 obs. of  3 variables:
      ..$ one  : num [1:200] 1.432 1.657 -0.72 -1.691 0.596 ...
      ..$ two  : num [1:200] 0.243 -0.159 -2.163 -1.183 0.632 ...
      ..$ three: num [1:200] 0.359 0.476 1.485 0.39 -1.412 ...
     $ 4:'data.frame':  200 obs. of  3 variables:
      ..$ one  : num [1:200] -1.43 -0.345 -1.206 -0.925 -0.551 ...
      ..$ two  : num [1:200] -1.343 1.322 0.208 0.444 -0.861 ...
      ..$ three: num [1:200] 0.00807 -0.20209 -0.56865 1.06983 -0.29673 ...
     $ 5:'data.frame':  123 obs. of  3 variables:
      ..$ one  : num [1:123] -1.269 1.555 -0.19 1.434 -0.889 ...
      ..$ two  : num [1:123] 0.558 0.0445 -0.0639 -1.934 -0.8152 ...
      ..$ three: num [1:123] -0.0821 0.6745 0.6095 1.387 -0.382 ...
    

    If some code might have changed the rownames it would be safer to use:

     # Group by row position instead of row names. seq_len() gives an empty
     # index for a zero-row df, whereas seq(0) would give c(1, 0).
     split(df, (seq_len(nrow(df)) - 1) %/% 200)
    
    0 讨论(0)
  • 2020-12-01 10:07

    This splits df into batches of one million rows and appends them, one batch at a time, to the table named "df" in the SQL database:

    batchsize <- 1000000 # rows per batch; vary to your liking
    n_rows <- nrow(df)

    # Cycle through the data one batch at a time. seq_len() gives zero
    # iterations when df has no rows, whereas 1:0 would iterate twice.
    for (i in seq_len(ceiling(n_rows / batchsize))) {
      print(i) # just to show the progress

      # Row range of this batch. The end is capped at the last row of df, so
      # the final batch is not padded with all-NA rows and no clean-up on a
      # particular column is needed afterwards.
      first_row <- (i - 1) * batchsize + 1
      last_row <- min(i * batchsize, n_rows)

      # drop = FALSE keeps the result a data frame even with a single column
      batch <- df[first_row:last_row, , drop = FALSE]

      # In this case the table already existed, so rows are appended; for a
      # new table use overwrite = TRUE instead.
      dbWriteTable(con, "df", batch, append = TRUE, row.names = FALSE)
    }
    
    0 讨论(0)
  • 2020-12-01 10:10
    # library() stops with an error if ff is not installed; require() would
    # only return FALSE and let the script fail later at chunk().
    library(ff)
    df <- data.frame(one = rnorm(1123), two = rnorm(1123), three = rnorm(1123))
    # chunk() is asked for row ranges covering 1..nrow(df) in steps of 200;
    # each range is then printed as a slice of df.
    for (i in chunk(from = 1, to = nrow(df), by = 200)) {
      print(df[min(i):max(i), ])
    }
    
    0 讨论(0)
提交回复
热议问题