Delete Duplicates for Large Dataset, Both True Duplicates (Whole row is duplicate) and Duplicate Based on One Column

穿精又带淫゛_ 提交于 2020-07-23 06:49:07

问题


I have a rather large dataset. Minimum rows are in the 8K range. I need to delete duplicates on two conditions. The first would be what I call a "True Duplicate." By definition this means that the entire row is a duplicate. Here is the script I have that works for that scenario.

/**
 * Removes rows that exactly repeat an earlier row, then rewrites the sheet.
 *
 * Two rows count as the same when their `join()` strings are equal. The first
 * occurrence of every row is kept and the original row order is preserved.
 * Seen rows are tracked in a Set so each row is checked once instead of being
 * compared against every row kept so far.
 *
 * @param {Object} sheet - Sheet-like object providing getDataRange(),
 *     clearContents() and getRange(row, column, numRows, numColumns).
 * @returns {number} Number of duplicate rows that were dropped.
 */
function removeDuplicates(sheet) {
  const data = sheet.getDataRange().getValues();
  const seenSignatures = new Set();
  const newData = [];
  let trueDuplicateCount = 0;

  for (const row of data) {
    // Join once per row; the joined string is the row's identity.
    const signature = row.join();
    if (seenSignatures.has(signature)) {
      trueDuplicateCount += 1;
      continue;
    }
    seenSignatures.add(signature);
    newData.push(row);
  }

  sheet.clearContents();
  // Nothing to write back when no rows were read; newData[0] would be undefined.
  if (newData.length > 0) {
    sheet.getRange(1, 1, newData.length, newData[0].length).setValues(newData);
  }
  return trueDuplicateCount;
}

The other condition would be a duplicate based on one column's info. After having removed "True Duplicates" I need to delete duplicates based on a column. I would like to keep the line with the earliest date on another column.

Here is what I tried but does not work for this scenario.

/**
 * Removes exact duplicate rows and then, among rows sharing the same value in
 * the second column, keeps only the row with the smallest first-column value
 * (the earliest date). The sheet is rewritten with the surviving rows.
 *
 * Whole-row equality is decided by comparing `row.join()` strings. Rows are
 * grouped on the string form of column index 1. A kept row stays at the
 * position where its group was first seen; if a later row in the same group
 * has an earlier date it replaces the kept row in that position. When two
 * non-identical rows of a group have equal dates, the first one seen is kept.
 *
 * @param {Object} sheet - Sheet-like object providing getDataRange(),
 *     clearContents() and getRange(row, column, numRows, numColumns).
 * @returns {number[]} `[trueDuplicateCount, diffDateDuplicateCount]`: rows
 *     dropped as exact duplicates, and rows dropped because another row of the
 *     same group was kept instead.
 */
function removeDuplicates(sheet) {
  const data = sheet.getDataRange().getValues();
  const seenSignatures = new Set();
  const slotByKey = new Map(); // group key -> index of the kept row in newData
  const newData = [];
  let trueDuplicateCount = 0;
  let diffDateDuplicateCount = 0;

  for (const row of data) {
    const signature = row.join();
    if (seenSignatures.has(signature)) {
      trueDuplicateCount += 1;
      continue;
    }
    seenSignatures.add(signature);

    const key = String(row[1]);
    const slot = slotByKey.get(key);
    if (slot === undefined) {
      slotByKey.set(key, newData.length);
      newData.push(row);
      continue;
    }

    // Exactly one row of the pair is discarded, whichever arrived first.
    diffDateDuplicateCount += 1;
    if (row[0] < newData[slot][0]) {
      newData[slot] = row;
    }
  }

  sheet.clearContents();
  // Nothing to write back when no rows were read; newData[0] would be undefined.
  if (newData.length > 0) {
    sheet.getRange(1, 1, newData.length, newData[0].length).setValues(newData);
  }
  return [trueDuplicateCount, diffDateDuplicateCount];
}

Here is a sample of the dataset

After Deleting the "True Duplicates"

After Deleting the duplicate with the later date

The above is what would be left after the script ran and then the function would also return an array with the count for each type of duplicate deleted.

The current script works for the True Duplicates portion, but I'm concerned about speed and possibly timing out on a larger dataset. With 8K rows I'm already seeing almost 3 minute run time. With that said, here are my asks.

Conditions

  • Speed, speed, speed. Is there a more efficient way of handling this? This is my biggest concern.
  • Must remove duplicate with the later date and keep the one with the earliest date.
  • Must return a count for each type of duplicate removed.

Hoping this clears up some confusion. I've shown what I want done with each step. (Acct Number is made up)

Comment about Solution chosen

I went with the solution that executed the fastest. While both Tanaike's and Master's worked, I went with Master's because I anticipate a lot of lines in the future. Every millisecond counts.

I just want to thank those who answered especially Tanaike who put in a lot of work. Hopefully this question becomes the holy grail for deleting duplicates because your non-v8 solution is still great for those without v8.


回答1:


You can use the built-in removeDuplicates method, which removes duplicate rows in place. Then use a hash object to remove the later-date duplicates:

Sample script:

/**
 * Removes duplicates from the two-column data area that starts at row 2.
 *
 * Exact duplicate rows are removed by the range's own removeDuplicates().
 * The remaining rows are grouped on the second column, and for each group only
 * the smallest first-column value (the earliest date) is written back.
 *
 * Groups are held in a Map, so the second-column values keep their original
 * type and the groups keep first-seen order. A plain object would turn the
 * keys into strings and move integer-like keys to the front.
 *
 * @param {Object} [sheet] - Sheet to process; when omitted, the sheet named
 *     'Sheet1' of the active spreadsheet is used.
 * @returns {string[]} `[trueDups, falseDups]` as strings: the number of exact
 *     duplicate rows removed, and the number of rows removed because an
 *     earlier date existed for the same second-column value.
 */
function remDups(sheet) {
  const sh = sheet || SpreadsheetApp.getActive().getSheetByName('Sheet1');

  // NOTE(review): a sheet with only a header would ask getRange for zero rows,
  // which Apps Script is expected to reject — so bail out early.
  const dataRowCount = sh.getLastRow() - 1;
  if (dataRowCount < 1) {
    return ['0', '0'];
  }

  const rg = sh.getRange(2, 1, dataRowCount, 2);
  const initDataSz = rg.getNumRows();
  const newRg = rg.removeDuplicates();
  const newDataSz = newRg.getNumRows();
  const trueDups = initDataSz - newDataSz;
  const values = newRg.getValues();
  newRg.clearContent();

  const earliestByKey = new Map();
  for (const [date, key] of values) {
    const known = earliestByKey.get(key);
    if (known === undefined || known - date > 0) {
      earliestByKey.set(key, date);
    }
  }

  const out = Array.from(earliestByKey, ([key, date]) => [date, key]);
  const falseDups = newDataSz - out.length;
  sh.getRange(2, 1, out.length, out[0].length).setValues(out);
  return [`${trueDups}`, `${falseDups}`];
}

Performance:

  • ~2.6 seconds for 15000 rows on V8 engine

References:

  • Range#removeDuplicates



回答2:


Try this:

/**
 * Deletes duplicate rows from the sheet in place, one deleteRow() call per
 * duplicate.
 *
 * A row is deleted when its `join()` string matches an earlier row (whole-row
 * duplicate). Otherwise it is deleted when its first-column value, formatted
 * as "yyyy/MM/dd" in the script time zone, matches the formatted date of an
 * earlier row that passed the whole-row check.
 *
 * @param {Object} sh - Sheet-like object providing getDataRange() and
 *     deleteRow(rowPosition).
 * @returns {number[]} `[t, t0]`: whole-row duplicates deleted, and
 *     same-date duplicates deleted.
 */
function removeDuplicates(sh) {
  const rows = sh.getDataRange().getValues();
  const timeZone = Session.getScriptTimeZone();
  const seenRows = new Set();
  const seenDays = new Set();
  let wholeRowCount = 0;
  let sameDayCount = 0;
  // Rows deleted so far. Every deletion shifts the rows below it up by one,
  // so the sheet position of data row i is i + 1 - deletedSoFar.
  let deletedSoFar = 0;

  rows.forEach((row, i) => {
    const signature = row.join();
    if (seenRows.has(signature)) {
      sh.deleteRow(i + 1 - deletedSoFar);
      deletedSoFar += 1;
      wholeRowCount += 1;
      return;
    }
    seenRows.add(signature);

    const dayKey = Utilities.formatDate(new Date(row[0]), timeZone, "yyyy/MM/dd");
    if (seenDays.has(dayKey)) {
      sh.deleteRow(i + 1 - deletedSoFar);
      deletedSoFar += 1;
      sameDayCount += 1;
    } else {
      seenDays.add(dayKey);
    }
  });

  return [wholeRowCount, sameDayCount];
}

As you said this might be faster. So give it a try.

/**
 * Filters duplicate rows in memory and rewrites the sheet once.
 *
 * A row is dropped when its `join()` string matches an earlier row (whole-row
 * duplicate). Otherwise it is dropped when its first-column value, formatted
 * as "yyyy/MM/dd" in the script time zone, matches the formatted date of an
 * earlier row that passed the whole-row check. A row is written back only if
 * it passes both checks, and it is written exactly once.
 *
 * @param {Object} sh - Sheet-like object providing getDataRange(),
 *     clearContents() and getRange(row, column, numRows, numColumns).
 * @returns {number[]} `[t, t0]`: whole-row duplicates dropped, and same-date
 *     duplicates dropped.
 */
function removeDuplicates(sh) {
  const rows = sh.getDataRange().getValues();
  const timeZone = Session.getScriptTimeZone();
  const seenRows = new Set();
  const seenDays = new Set();
  const kept = [];
  let wholeRowCount = 0;
  let sameDayCount = 0;

  rows.forEach((row) => {
    const signature = row.join();
    if (seenRows.has(signature)) {
      wholeRowCount += 1;
      return;
    }
    seenRows.add(signature);

    const dayKey = Utilities.formatDate(new Date(row[0]), timeZone, "yyyy/MM/dd");
    if (seenDays.has(dayKey)) {
      sameDayCount += 1;
      return;
    }
    seenDays.add(dayKey);
    kept.push(row);
  });

  sh.clearContents();
  // Nothing to write back when no rows were read; kept[0] would be undefined.
  if (kept.length > 0) {
    sh.getRange(1, 1, kept.length, kept[0].length).setValues(kept);
  }
  return [wholeRowCount, sameDayCount];
}



回答3:


  • You want to remove the duplicated values of date and color.
  • You want to achieve the result, which is shown as the images, in your question.
  • You want to reduce the process cost of your Google Apps Script.

If my understanding is correct, how about this answer? Please think of this as just one of several possible answers.

Flow:

  1. Retrieve values from the sheet.
  2. Create an object for retrieving trueDuplicateCount.
  3. Create an object for retrieving diffDateDuplicateCount.
  4. Create an array for putting to Spreadsheet.
  5. Put the values to the Spreadsheet.
  6. Calculate trueDuplicateCount and diffDateDuplicateCount.

Sample script:

/**
 * Rewrites a two-column sheet (date in the first column, key in the second,
 * header in the first row) so that each key appears once, paired with its
 * earliest date.
 *
 * Rows are grouped on the string form of the key. Each group tracks its
 * distinct timestamps directly, so a key that itself contains "_" is handled
 * correctly (no composite string key has to be split apart again), and the
 * key is written back with the value it was first read with.
 *
 * @param {Object} sheet - Sheet-like object providing getDataRange(),
 *     clearContents() and getRange(row, column, numRows, numColumns). The
 *     first-column cells below the header must expose getTime().
 * @returns {number[]} `[trueDuplicateCount, diffDateDuplicateCount]`: rows
 *     repeating an already seen (key, date) pair, and distinct pairs dropped
 *     because the same key had an earlier date.
 */
function removeDuplicates(sheet) {
  // Retrieve values from the sheet.
  const data = sheet.getDataRange().getValues();
  const header = data.shift(); // Remove the header row.

  // Group rows by key, remembering each group's distinct dates and minimum.
  const groups = new Map();
  let distinctPairCount = 0;
  for (const [date, key] of data) {
    const time = date.getTime();
    const groupId = String(key);
    let group = groups.get(groupId);
    if (group === undefined) {
      group = { key, times: new Set(), earliest: time };
      groups.set(groupId, group);
    }
    if (!group.times.has(time)) {
      group.times.add(time);
      distinctPairCount += 1;
      if (time < group.earliest) {
        group.earliest = time;
      }
    }
  }

  // Create an array for putting to Spreadsheet.
  const ar = [header];
  for (const { key, earliest } of groups.values()) {
    ar.push([new Date(earliest), key]);
  }

  // Put the values to the Spreadsheet.
  sheet.clearContents();
  sheet.getRange(1, 1, ar.length, ar[0].length).setValues(ar);

  // Calculate trueDuplicateCount and diffDateDuplicateCount.
  const trueDuplicateCount = data.length - distinctPairCount;
  const diffDateDuplicateCount = distinctPairCount - groups.size;
  return [trueDuplicateCount, diffDateDuplicateCount];
}
  • The above script assumes that the 1st row is a header row. If your sheet has no header row, please remove data.shift().
  • In this case, the result values are written starting from the 1st row, as in your script, so the existing header row is cleared. Please be careful about this.

Note:

  • In the above script, the date values retrieved from the cells are used as Date objects. Please be careful about this.

Added:

When v8 runtime can be used, above script can be written as follows.

/**
 * V8-runtime variant: rewrites a two-column sheet (date in the first column,
 * key in the second, header in the first row) so that each key appears once,
 * paired with its earliest date.
 *
 * Rows are folded into a Map in a single pass. The accumulator is mutated
 * rather than copied on every row, and no composite "key_timestamp" string is
 * built, so keys containing "_" are handled correctly. Rows are grouped on the
 * string form of the key, and the key is written back with the value it was
 * first read with.
 *
 * @param {Object} sheet - Sheet-like object providing getDataRange(),
 *     clearContents() and getRange(row, column, numRows, numColumns). The
 *     first-column cells below the header must expose getTime().
 * @returns {number[]} `[trueDuplicateCount, diffDateDuplicateCount]`: rows
 *     repeating an already seen (key, date) pair, and distinct pairs dropped
 *     because the same key had an earlier date.
 */
function removeDuplicates_v8(sheet) {
  // Retrieve values from the sheet.
  const data = sheet.getDataRange().getValues();
  const header = data.shift(); // Remove the header row.

  // Per key: the first-seen key value, its distinct timestamps, and the minimum.
  const byKey = new Map();
  let distinctPairs = 0;
  data.forEach(([date, key]) => {
    const stamp = date.getTime();
    const id = String(key);
    if (!byKey.has(id)) {
      byKey.set(id, { key, stamps: new Set(), min: stamp });
    }
    const entry = byKey.get(id);
    if (entry.stamps.has(stamp)) {
      return;
    }
    entry.stamps.add(stamp);
    distinctPairs += 1;
    if (stamp < entry.min) {
      entry.min = stamp;
    }
  });

  // Create an array for putting to Spreadsheet.
  const ar = Array.from(byKey.values(), ({ key, min }) => [new Date(min), key]);

  // Calculate trueDuplicateCount and diffDateDuplicateCount.
  const trueDuplicateCount = data.length - distinctPairs;
  const diffDateDuplicateCount = distinctPairs - ar.length;

  // Put the values to the Spreadsheet.
  sheet.clearContents();
  ar.unshift(header);
  sheet.getRange(1, 1, ar.length, ar[0].length).setValues(ar);
  return [trueDuplicateCount, diffDateDuplicateCount];
}


来源:https://stackoverflow.com/questions/60133131/delete-duplicates-for-large-dataset-both-true-duplicates-whole-row-is-duplicat

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!