WeightedCorePredicate Implementation for ELKI - An example

Submitted by 喜夏-厌秋 on 2021-02-11 12:31:55

Question


I've recently tried to implement an example of weighted DBSCAN in ELKI by modifying the CorePredicate (using MinPtsCorePredicate as the base to build on), and I was wondering if anyone could critique whether this is the right implementation for this situation:


import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.*;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.datasource.DatabaseConnection;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;



public class SampleELKI2 {

    public static void main(String[] args) {

        double[][] data = new double[1000][3]; // The third column holds the weights
        for (int i = 0; i < data.length; i++) {
            for (int j = 0; j < data[i].length; j++) {
                data[i][j] = Math.random();
            }
        }

        // Adapter to load data from an existing array
        DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
        // Create a database (which may contain multiple relations)
        Database db = new StaticArrayDatabase(dbc, null);
        // Load the data into the database (do NOT forget to initialize)
        db.initialize();
        // Relation containing the number vectors
        Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
        // We know that the ids must be a continuous range
        DBIDRange ids = (DBIDRange) rel.getDBIDs();
        SquaredEuclideanDistanceFunction dist = SquaredEuclideanDistanceFunction.STATIC;
        // Compute the neighbourhood and core predicates here for generalized DBSCAN
        // --------------------------------------------------------------------- //
        EpsilonNeighborPredicate ENP = new EpsilonNeighborPredicate(0.3, dist); // generic neighbourhood predicate
        WeightedCorePredicate WCP = new WeightedCorePredicate(330, db, 2); // weight threshold (330), the database, and the index of the weight column (2)
        // Plug the predicates into GeneralizedDBSCAN; either predicate can be swapped out independently
        Clustering<Model> result = new GeneralizedDBSCAN(ENP, WCP, false).run(db);
        int i = 0;
        for (Cluster<Model> clu : result.getAllClusters()) {
            for (DBIDIter it = clu.getIDs().iter(); it.valid(); it.advance()) {
                NumberVector v = rel.get(it);         // the data point itself
                final int offset = ids.getOffset(it); // its row index in the input array
                // per-point processing would go here
            }
            ++i; // cluster counter
        }
    }
}

The new WeightedCorePredicate looks something like this; it is a slight modification of the MinPtsCorePredicate class from the ELKI source files.

import de.lmu.ifi.dbs.elki.algorithm.clustering.DBSCAN;
import de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;

public class WeightedCorePredicate implements CorePredicate<DBIDs> {

   /**
   * Class logger.
   */
  public static final Logging LOG = Logging.getLogger(WeightedCorePredicate.class);

  /**
   * The minpts parameter, used here as the threshold on the summed weights.
   */
  protected int minpts;

  static Database db;      // database holding the weight column
  static int WeightColumn; // index of the weight column
  static int WeightSum;    // unused at class level
  /**
   * Default constructor.
   *
   * @param minpts Minimum summed neighbor weight to be a core point.
   * @param db Database containing the weight column
   * @param WeightColumn Index of the weight column
   */
  public WeightedCorePredicate(int minpts, Database db, int WeightColumn) {
    super();
    this.minpts = minpts;
    this.db = db;
    this.WeightColumn = WeightColumn;
  }

  @Override
  public Instance instantiate(Database database) {
      return new Instance(minpts, db, WeightColumn);
  }

  @Override
  public boolean acceptsType(SimpleTypeInformation<? extends DBIDs> type) {
    return TypeUtil.DBIDS.isAssignableFromType(type) //
        || TypeUtil.NEIGHBORLIST.isAssignableFromType(type);
  }

  /**
   * Instance for a particular data set.
   *
   * @author Erich Schubert
   */
  public static class Instance implements CorePredicate.Instance<DBIDs> {
    /**
     * The minpts parameter.
     */
    protected int minpts;
    protected Database db;
    protected int WeightColumn;
    protected double WeightSum;
    /**
     * Constructor for this predicate.
     *
     * @param minpts MinPts parameter (weight threshold)
     * @param db Database containing the weight column
     * @param WeightColumn Index of the weight column
     */
    public Instance(int minpts, Database db, int WeightColumn) {
      super();
      this.minpts = minpts;
      this.db = db;
      this.WeightColumn = WeightColumn;
    }

    @Override
    public boolean isCorePoint(DBIDRef point, DBIDs neighbors) {
      // The database is already initialized by the caller, so do not call db.initialize() here.
      Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD); // relation to access the data points
      WeightSum = 0; // reset the accumulated weight
      // The DBIDs are the indices of the neighbors, so the relation gives us each point
      for (DBIDIter it = neighbors.iter(); it.valid(); it.advance()) {
        WeightSum += rel.get(it).doubleValue(WeightColumn); // sum the neighbor weights
      }
      return WeightSum >= minpts;
    }
  }
    
  /**
   * Parameterization class
   *
   * @author Erich Schubert
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Minpts value
     */
    protected int minpts;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      // Get the minpts parameter
      IntParameter minptsP = new IntParameter(DBSCAN.Parameterizer.MINPTS_ID) //
          .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
      if(config.grab(minptsP)) {
        minpts = minptsP.intValue();
        if(minpts <= 2) {
          LOG.warning("DBSCAN with minPts <= 2 is equivalent to single-link clustering at a single height. Consider using larger values of minPts.");
        }
      }
    }

    @Override
    protected WeightedCorePredicate makeInstance() {
      return new WeightedCorePredicate(minpts, db, WeightColumn);
    }
  }
}

Essentially, I've added inputs to the WeightedCorePredicate: a reference to the Database, so that the neighbor DBIDs can be used to look up the corresponding points in the relation, and this.WeightColumn to pick out the column that stores the weights alongside the X/Y columns. A point is then a core point when the summed weights of its neighbors reach minpts, which reduces to plain DBSCAN when all weights are 1. This follows on from the discussion pointed to here: Elki GDBSCAN Java/Scala - how to modify the CorePredicate and the sample_weight option in the ELKI implementation of DBSCAN.
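As a quick sanity check (my own sketch, not from the original post, and assuming the classes and imports above compile as shown): with all weights fixed at 1.0, the summed weights equal the neighbor count, and the constant third column does not affect squared Euclidean distances, so the weighted predicate should reproduce the stock MinPtsCorePredicate with the same eps and minPts:

double[][] data = new double[1000][3];
for (int i = 0; i < data.length; i++) {
    data[i][0] = Math.random();
    data[i][1] = Math.random();
    data[i][2] = 1.0; // unit weights
}
Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
db.initialize();
EpsilonNeighborPredicate enp = new EpsilonNeighborPredicate(0.3, SquaredEuclideanDistanceFunction.STATIC);
// Weighted variant: threshold 10 on the summed weights of the neighbors
Clustering<Model> weighted = new GeneralizedDBSCAN(enp, new WeightedCorePredicate(10, db, 2), false).run(db);
// Stock variant: threshold 10 on the neighbor count
Clustering<Model> plain = new GeneralizedDBSCAN(enp, new MinPtsCorePredicate(10), false).run(db);
// With unit weights, both runs should yield the same clusters (up to ordering).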

Any feedback regarding this would be much appreciated. I don't come from a Java background (mostly Python/Scala), so I completely understand that this isn't the most elegant Java code.

Thanks


Answer 1:


Do not use static variables for stuff that should be local or instance variables!

They are in 99.99% of cases the wrong thing to do.

Cf. Why are static variables considered evil?
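To make that concrete, here is a minimal sketch (mine, not the answerer's, assuming the same ELKI 0.7.x API as the question) of the predicate with the static fields turned into instance fields; the relation is resolved once in instantiate() from the database that GeneralizedDBSCAN passes in, so the predicate no longer needs to hold a Database reference or re-fetch the relation on every isCorePoint() call:

public class WeightedCorePredicate implements CorePredicate<DBIDs> {
  protected double minweight; // threshold on the summed neighbor weights
  protected int weightColumn; // index of the weight column

  public WeightedCorePredicate(double minweight, int weightColumn) {
    this.minweight = minweight; // instance state, no statics
    this.weightColumn = weightColumn;
  }

  @Override
  public Instance instantiate(Database database) {
    // Resolve the relation once, from the database handed in by GDBSCAN.
    Relation<NumberVector> rel = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
    return new Instance(minweight, rel, weightColumn);
  }

  @Override
  public boolean acceptsType(SimpleTypeInformation<? extends DBIDs> type) {
    return TypeUtil.DBIDS.isAssignableFromType(type)
        || TypeUtil.NEIGHBORLIST.isAssignableFromType(type);
  }

  public static class Instance implements CorePredicate.Instance<DBIDs> {
    protected double minweight;
    protected Relation<NumberVector> rel;
    protected int weightColumn;

    public Instance(double minweight, Relation<NumberVector> rel, int weightColumn) {
      this.minweight = minweight;
      this.rel = rel;
      this.weightColumn = weightColumn;
    }

    @Override
    public boolean isCorePoint(DBIDRef point, DBIDs neighbors) {
      double sum = 0.; // local accumulator instead of a shared field
      for (DBIDIter it = neighbors.iter(); it.valid(); it.advance()) {
        sum += rel.get(it).doubleValue(weightColumn);
      }
      return sum >= minweight;
    }
  }
}

A Parameterizer for this version would likewise take the weight threshold and the weight column index as parameters rather than reading static fields.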



Source: https://stackoverflow.com/questions/65204069/weightedcorepredicate-implementation-for-elki-an-example
