Question
I'm trying to graph a huge data set (about 1.6 million points) using Kendo UI. That number is too large to plot, but I have figured out that many of these points are repeated. The data is currently stored in this format: [ [x,y], [x,y], [x,y], ...], with each x and y being a number, so each subarray is a point. The approach I have in mind is to create a second, empty array, loop through the very long original array, and push each point to the new array only if it isn't already found there.
I tried to use jQuery.inArray(), but it does not seem to work with the 2D array I have here.
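As far as I can tell, that is because inArray() compares elements with strict equality, and two separately created arrays are never === each other, so a nested point can never be matched this way:
var points = [ [1, 2], [3, 4] ];
console.log(jQuery.inArray([1, 2], points)); // -1, even though the point is "in" the list
console.log([1, 2] === [1, 2]);              // false - two distinct array objects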
Here is what I currently have:
var datMinified = [];
for (var z = 2; z < dat1.length; z++) // start at 2 because the first 2 elements are strings, disregard this
{
    if ( !testContains(datMinified, dat1[z]) )
    {
        datMinified.push(dat1[z]);
    }
}
with the helper functions defined as:
function testContains(arr, val)
{
    for (var i = 0; i < arr.length; i++)
    {
        if ( arraysEqual(arr[i], val) )
        {
            return true;
        }
    }
    return false;
}
and:
function arraysEqual(arr1, arr2)
{
    if (arr1.length != arr2.length)
    {
        return false;
    }
    for (var i = 0; i < arr1.length; i++)
    {
        if (arr1[i] != arr2[i])
        {
            return false;
        }
    }
    return true;
}
When I run this script, even with a smaller array of about six thousand points, it still hangs. Maybe jQuery has a good solution for this?
Edit: I was also thinking that there might be some way to tell the browser not to time out and just keep working through the data?
Answer 1:
You have a non-trivial problem, but I'm gonna blast right through it, so ask questions if I lose you somewhere along the line. This solution does not cast the coordinates to Strings or serialise them using other techniques like JSON.stringify.
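For contrast, here is a minimal sketch of that string-serialisation approach - it also works, but every point pays for a JSON.stringify call on the way in -
const uniqByString = (coords = []) =>
{ const seen = new Set
  const r = []
  for (const c of coords)
  { const key = JSON.stringify(c) // [ 908, 719 ] -> "[908,719]"
    if (!seen.has(key))
    { seen.add(key)
      r.push(c)
    }
  }
  return r
}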
Start with a way to create coordinates -
const Coord = (x, y) =>
[ x, y ]
To demonstrate the solution, I need to construct many random coordinates -
const rand = x =>
Math.floor(Math.random() * x)
const randCoord = x =>
Coord(rand(x), rand(x))
console.log(randCoord(1e3))
// [ 655, 89 ]
Now we make an array of 1 million random coordinates -
const million =
Array.from(Array(1e6), _ => randCoord(1e3))
Now we make a function that filters the coordinates down to their unique values using DeepMap, a tiny module I developed in another answer -
const uniq = (coords = []) =>
{ const m = new Map
const r = []
for (const c of coords)
if (!DeepMap.has(m, c))
{ DeepMap.set(m, c, true)
r.push(c)
}
return r
}
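To make the lookup structure concrete, here is a tiny illustration (using the DeepMap definition included further down): each part of a coordinate becomes a key in a nested Map, so a membership check is just two plain Map lookups, never a string conversion -
const m = new Map
DeepMap.set(m, [ 908, 719 ], true)
DeepMap.set(m, [ 908, 5 ], true)
console.log(DeepMap.has(m, [ 908, 719 ])) // true
console.log(DeepMap.has(m, [ 1, 2 ]))     // false
// m now looks like: Map { 908 => Map { 719 => true, 5 => true } }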
Because plain for loops and DeepMap have excellent performance, uniq can identify all of the unique values in less than one second -
console.time("uniq")
const result = uniq(million)
console.timeEnd("uniq")
console.log("uniq length:", result.length)
console.log("sample:", result.slice(0,10))
// uniq: 535 ms
// uniq length: 631970
// sample:
// [ [ 908, 719 ]
// , [ 532, 967 ]
// , [ 228, 689 ]
// , [ 942, 546 ]
// , [ 716, 180 ]
// , [ 456, 427 ]
// , [ 714, 79 ]
// , [ 315, 480 ]
// , [ 985, 499 ]
// , [ 212, 407 ]
// ]
Here is the complete snippet so you can verify the results in your own browser -
const DeepMap =
{ has: (map, [ k, ...ks ]) =>
ks.length === 0
? map.has(k)
: map.has(k)
? DeepMap.has(map.get(k), ks)
: false
, set: (map, [ k, ...ks ], value) =>
ks.length === 0
? map.set(k, value)
: map.has(k)
? (DeepMap.set(map.get(k), ks, value), map)
: map.set(k, DeepMap.set(new Map, ks, value))
}
const Coord = (x, y) =>
[ x, y ]
const rand = x =>
Math.floor(Math.random() * x)
const randCoord = x =>
Coord(rand(x), rand(x))
const million =
Array.from(Array(1e6), _ => randCoord(1e3))
const uniq = (coords = []) =>
{ const m = new Map
const r = []
for (const c of coords)
if (!DeepMap.has(m, c))
{ DeepMap.set(m, c, true)
r.push(c)
}
return r
}
console.time("uniq")
const result = uniq(million)
console.timeEnd("uniq")
console.log("uniq length:", result.length)
console.log("sample:", result.slice(0,10))
// uniq: 535 ms
// uniq length: 631970
// sample:
// [ [ 908, 719 ]
// , [ 532, 967 ]
// , [ 228, 689 ]
// , [ 942, 546 ]
// , [ 716, 180 ]
// , [ 456, 427 ]
// , [ 714, 79 ]
// , [ 315, 480 ]
// , [ 985, 499 ]
// , [ 212, 407 ]
// ]
By generating smaller random coordinates, we can verify that uniq produces a correct output. Below we generate coordinates up to [ 100, 100 ], for a maximum of 10,000 possible unique coordinates. When you run the program below, because the coordinates are generated at random, result.length may come out under 10,000, but it should never exceed it - if it did, we would know an invalid (duplicate) coordinate was added -
const million =
Array.from(Array(1e6), _ => randCoord(1e2))
console.time("uniq")
const result = uniq(million)
console.timeEnd("uniq")
console.log("uniq length:", result.length)
console.log("sample:", result.slice(0,10))
// uniq: 173 ms
// uniq length: 10000
// sample:
// [ [ 50, 60 ]
// , [ 18, 69 ]
// , [ 87, 10 ]
// , [ 8, 7 ]
// , [ 91, 41 ]
// , [ 48, 47 ]
// , [ 78, 28 ]
// , [ 39, 12 ]
// , [ 18, 84 ]
// , [ 0, 71 ]
// ]
Lastly, I'll include the DeepMap module used here -
const DeepMap =
{ has: (map, [ k, ...ks ]) =>
ks.length === 0
? map.has(k)
: map.has(k)
? DeepMap.has(map.get(k), ks)
: false
, set: (map, [ k, ...ks ], value) =>
ks.length === 0
? map.set(k, value)
: map.has(k)
? (DeepMap.set(map.get(k), ks, value), map)
: map.set(k, DeepMap.set(new Map, ks, value))
, get: (map, [ k, ...ks ]) =>
// ...
, entries: function* (map, fields = [])
// ...
}
For a complete implementation, see the linked Q&A. Fwiw, I do think you will find the link interesting as it provides more context for the complexity of this problem.
Answer 2:
You could try something like this. It would probably be helpful to do some benchmarking, or to consider doing this server-side. That is a lot of data, and you will probably see most browsers hang:
points = ["test", "string", [1,1], [1,2],[1,3],[1,4],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7],[1,8],[2,1],[2,1],[2,2],[1,1],[1,1],[1,1],[1,1],[1,1]];
t={};
unique = points.filter(e=>!(t[e]=e in t));
console.log(unique);
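Spelled out, that one-liner does essentially the following - the point e is coerced to a string key ("1,1", "1,2", ...), so repeated points collapse onto the same key and only the first occurrence passes the filter (note that the two leading strings also survive as their own unique entries):
var seen = {};
var uniquePoints = points.filter(function (e) {
  var key = String(e);           // [1, 2] -> "1,2"
  var isDuplicate = key in seen; // seen before?
  seen[key] = true;              // remember it either way
  return !isDuplicate;           // keep only the first occurrence
});
console.log(uniquePoints);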
Answer 3:
UPDATE
In short: you can use a Set to automatically create a collection of unique values (which is what differentiates Set from Map), provided these values are in a suitable (i.e. comparable) format:
let collection = new Set(data.map((point) => point.toString()));
collection = [...collection].map((val) => val.split(','));
These two lines are enough to filter your 1-million-plus array down to its unique values in about 1 second. For a lengthier explanation, see the third example below =)...
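One caveat: after split(',') the coordinate parts are strings, not numbers, so if your charting code expects numeric points you can map them back in the same step, e.g.:
collection = [...collection].map((val) => val.split(',').map(Number));
// "1,2" -> [ "1", "2" ] -> [ 1, 2 ]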
Original Answer
jQuery is mainly for DOM manipulation and for smoothing over (older) browser quirks, not for dealing with big data! So, no, I would not recommend it, plus it would slow down your processing even more... The question is: can you use modern JS (e.g. generator functions) in your app, or does it have to work in older browsers as well?
I'm not sure how this will go performance-wise with 1+ million entries, but let me know how it works out (where data is your datMinified, of course):
const data = [
'string',
'string',
[1, 2],
[1, 2],
[2, 3],
[3, 4],
[3, 4],
[4, 5],
];
data.splice(0, 2); // remove your 2 strings at the beginning
console.time('filtering with reduce');
let collection = data.reduce((acc, val) => {
const pointstr = val.toString();
if ( !acc.includes(pointstr) ) {
acc.push(pointstr);
}
return acc;
}, []);
collection = collection.map((point) => point.split(',')); // assign the result, otherwise the split pairs are thrown away
console.timeEnd('filtering with reduce');
console.log(`filtered data has ${collection.length} entries!`);
A generator function could help you keep memory consumption down (maybe? =), and it would spare you the .map() part at the end of the above example:
console.time('filtering with generator');
function* filter(arr) {
let filtered = [];
for (var i = 0, l = arr.length; i < l; i++ ) {
const pointstr = arr[i].toString();
if ( !filtered.includes(pointstr) ) {
filtered.push(pointstr);
yield arr[i];
}
}
}
let collection = [];
for (let point of filter(data)) {
collection.push(point);
}
console.timeEnd('filtering with generator');
console.log(`filtered data has ${collection.length} entries!`);
EDIT
Both of the above are horrible in terms of performance, because .includes() rescans the whole accumulated array for every point, so the work grows quadratically. Here is a realistic scenario for your use case with 1,000,000 data points and a significant improvement, based on @user633183's suggestion to use a Set or Map. I chose a Set because it represents a collection of unique values, which is exactly what we want, i.e. it automatically takes care of the filtering for us (if the data is in the right form to identify duplicates, of course):
const randomBetween = (min,max) => Math.floor(Math.random()*(max-min+1)+min);
var data = Array(1000000);
for (var i = data.length; i; data[--i] = [randomBetween(1,1000), randomBetween(1, 1000)]);
console.log(`unfiltered data has ${data.length} entries!`);
console.time('filtering');
// create the Set with unique values by adding them as strings
// like that the Set will automatically filter duplicates
let collection = new Set(data.map((point) => point.toString()));
console.log(`filtered data has ${collection.size} entries!`);
// we still have to revert the toString() process here
// but we operate on the automatically filtered collection of points
// and this is fast!
collection = [...collection].map((val) => val.split(','));
console.log(`resulting data has ${collection.length} entries!`);
console.timeEnd('filtering');
thanks again @user633183, learned something today =)!
Another option would be to combine the generator function with a Set, like this:
console.time('filtering with generator and Set');
function* filterSet(arr) {
let filtered = new Set();
for (var i = 0, l = arr.length; i < l; i++ ) {
const pointstr = arr[i].toString();
if ( !filtered.has(pointstr) ) {
filtered.add(pointstr);
yield arr[i];
}
}
}
let collection = [];
for (let point of filterSet(data)) {
collection.push(point);
}
console.timeEnd('filtering with generator and Set');
console.log(`filtered data has ${collection.length} entries!`);
This again spares you from having to reverse the .toString() step and is just slightly faster than the "direct" new Set() approach.
To finish this up, here is a completely subjective benchmark on my machine with 100,000 data points:
unfiltered data has 100000 entries!
filtering with reduce: 31946.634ms
filtered data has 95232 entries!
filtering with generator: 39533.802ms
filtered data has 95232 entries!
filtering with generator and Set: 107.893ms
filtered data has 95232 entries!
filtering with Set: 159.894ms
filtered data has 95232 entries!
Source: https://stackoverflow.com/questions/56894538/how-to-remove-repeating-entries-in-a-massive-array-javascript