Splitting database column into multivalued Solr field

泪湿孤枕 提交于 2019-12-13 05:59:07

问题


I'm going nuts trying to figure out how to get the Data Import Handler's splitBy construct to work. I was expecting it to split the input column into a multivalued field. Here's a test case to reproduce the problem:

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;

import static org.junit.Assert.*;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.core.CoreContainer;
import org.hsqldb.jdbc.JDBCDataSource;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * JUnit reproduction case for the Data Import Handler's {@code splitBy} attribute.
 *
 * <p>The test populates a one-row HSQLDB table whose {@code type} column holds
 * {@code 'foo,bar,baz'}, writes a minimal Solr configuration into a temporary directory,
 * starts an embedded Solr server on it, triggers a {@code full-import} and then expects
 * every indexed document to carry three {@code type} values.
 *
 * <p>NOTE(review): the {@code <field>} definition in {@link #data_config_xml} is kept exactly
 * as posted because it is the configuration under investigation; the answer on this page
 * reports that {@code splitBy} fields need {@code sourceColName} — confirm against the
 * RegexTransformer documentation before changing it.
 */
public class TestSplitBy {

  /** Embedded Solr server; assigned by {@link #startSolr()}, shut down in {@link #tearDown()}. */
  SolrServer server;

  /** Unique per-run directory under the system temp directory, used as the Solr home. */
  File configPath = new File(FileUtils.getTempDirectory(), Long.toString(System.nanoTime()));

  /** Minimal solrconfig.xml: a default search handler plus the /dataimport handler. */
  String solrconfig_xml = "<config><luceneMatchVersion>LUCENE_41</luceneMatchVersion><requestHandler name=\"search\" class=\"solr.SearchHandler\" default=\"true\"><lst name=\"defaults\"><str name=\"fl\">*</str><str name=\"df\">id</str></lst></requestHandler><requestHandler name=\"/dataimport\" class=\"org.apache.solr.handler.dataimport.DataImportHandler\"><lst name=\"defaults\"><str name=\"config\">data-config.xml</str></lst></requestHandler></config>";

  /** DIH configuration: one entity reading every row of {@code test} through RegexTransformer. */
  String data_config_xml = "<dataConfig>" +
      "<dataSource url=\"jdbc:hsqldb:mem:testdb\" user=\"SA\" driver=\"org.hsqldb.jdbc.JDBCDriver\" />" +
      "<document>" +
      "<entity name=\"item\" transformer=\"RegexTransformer\" query=\"SELECT * FROM test\">" +
      "<field column=\"type\" name=\"type\" splitBy=\",\" />" +
      "</entity>" +
      "</document>" +
      "</dataConfig>";

  /** Schema with a single-valued {@code id} key and a multi-valued {@code type} field. */
  String schema_xml = "<schema version=\"1.3\" name=\"test\">" +
      "<types>" +
      "<fieldType name=\"string\" class=\"solr.StrField\" sortMissingLast=\"true\" omitNorms=\"true\" />" +
      "</types>" +
      "<fields>" +
      "<field stored=\"true\" name=\"id\" type=\"string\" />" +
      "<field stored=\"true\" name=\"type\" type=\"string\" multiValued=\"true\"/>" +
      "</fields>" +
      "<uniqueKey>id</uniqueKey>" +
      "</schema>";

  /**
   * Builds the HSQLDB data source used to populate the test table.
   *
   * @return a data source configured with user {@code SA} and URL {@code mem:testdb}
   */
  DataSource getDataSource() {
    JDBCDataSource ds = new JDBCDataSource();
    ds.setUser("SA");
    ds.setUrl("mem:testdb");
    return ds;
  }

  /**
   * Recreates the {@code test} table and inserts the single comma-separated fixture row.
   *
   * @param ds data source to run the statements against
   * @throws IllegalStateException if any statement fails; the fixture is useless without
   *     the row, so the failure is surfaced instead of only being printed
   */
  void populateDb(DataSource ds) {
    QueryRunner runner = new QueryRunner(ds);
    try {
      runner.update("DROP TABLE test IF EXISTS");
      runner.update("CREATE TABLE test(id INTEGER, type VARCHAR(256));");
      runner.update("INSERT INTO test VALUES 1, 'foo,bar,baz'");
    } catch (SQLException e) {
      throw new IllegalStateException("Failed to populate the in-memory test database", e);
    }
  }

  /**
   * Writes data-config.xml, schema.xml and solrconfig.xml into
   * {@code <configPath>/collection1/conf}.
   *
   * @throws IOException if the conf directory cannot be created or a file cannot be written
   */
  void writeSolrConfig() throws IOException {
    File corePath = new File(configPath, "collection1");
    File confPath = new File(corePath, "conf");
    // configPath itself is never created elsewhere, so build the whole chain in one call.
    if (!confPath.mkdirs() && !confPath.isDirectory()) {
      throw new IOException("Could not create Solr conf directory " + confPath);
    }
    FileUtils.write(new File(confPath, "data-config.xml"), data_config_xml);
    FileUtils.write(new File(confPath, "schema.xml"), schema_xml);
    FileUtils.write(new File(confPath, "solrconfig.xml"), solrconfig_xml);
  }

  /** Points {@code solr.solr.home} at {@link #configPath} and starts the embedded server. */
  void startSolr() {
    System.setProperty("solr.solr.home", configPath.getAbsolutePath());
    CoreContainer.Initializer initializer = new CoreContainer.Initializer();
    CoreContainer coreContainer = initializer.initialize();
    server = new EmbeddedSolrServer(coreContainer, "collection1");
  }

  /** Prepares the database, the on-disk Solr configuration and the embedded server. */
  @Before
  public void setup() throws IOException {
    populateDb(getDataSource());
    writeSolrConfig();
    startSolr();
  }

  /** Shuts the server down (if it was started) and removes the temporary Solr home. */
  @After
  public void tearDown() {
    // setup() may have failed before the server was assigned; avoid masking that failure
    // with a NullPointerException here.
    if (server != null) {
      server.shutdown();
    }
    FileUtils.deleteQuietly(configPath);
  }

  /**
   * Triggers a full import and checks that each indexed document has three {@code type} values.
   *
   * @throws Exception if a Solr request fails or the wait is interrupted
   */
  @Test
  public void testSplitBy() throws Exception {
    SolrQuery query = new SolrQuery();
    query.set("qt", "/dataimport");
    query.setParam("command", "full-import");
    server.query(query);
    // Presumably the import runs in the background, hence the fixed wait — TODO confirm.
    Thread.sleep(500);

    QueryResponse response = server.query(new SolrQuery("*:*"));
    // Without this check the loop below passes vacuously when nothing was imported.
    assertFalse("full-import indexed no documents", response.getResults().isEmpty());
    for (SolrDocument doc: response.getResults()) {
      assertNotNull(doc.getFieldValues("type"));
      assertEquals(3, doc.getFieldValues("type").size());
    }
  }
}

And the POM for the test case:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Maven build for the TestSplitBy reproduction case. -->
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>solr</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Solr Sanity</name>

  <build>
    <plugins>
      <!-- Compile with Java 1.6 source and target levels. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <!-- Solr 4.1.0 artifacts: the war, the Data Import Handler and the SolrJ client. -->
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr</artifactId>
      <version>4.1.0</version>
      <type>war</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-dataimporthandler</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <!-- Commons DbUtils supplies the QueryRunner the test uses to populate the database. -->
    <dependency>
      <groupId>commons-dbutils</groupId>
      <artifactId>commons-dbutils</artifactId>
      <version>1.5</version>
      <type>jar</type>
    </dependency>
    <!-- HSQLDB is the database behind the jdbc:hsqldb:mem:testdb URL; declared with runtime scope. -->
    <dependency>
      <groupId>org.hsqldb</groupId>
      <artifactId>hsqldb</artifactId>
      <version>2.2.9</version>
      <type>jar</type>
      <scope>runtime</scope>
    </dependency>
    <!-- JUnit 4 provides the @Before/@After/@Test annotations and assertions used by the test. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>
    <!-- NOTE(review): presumably needed by the embedded Solr server at runtime; confirm. -->
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>servlet-api</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>

Any insight on how to get those types to split correctly into multiple values?


回答1:


Turns out there were a couple of issues with this unit test:

  1. HSQLDB's column names are case-sensitive, and unquoted names default to upper case (so the column is TYPE, not type).

  2. If the Solr field name and the database column name are identical, an extra value containing the entire, unsplit database value is also added to the field.

  3. The field definition should look like:

<field column="solrField" splitBy="," sourceColName="TYPE" />

And in general - when using the RegexTransformer to mix single valued fields from a DB with multivalued fields:

  • If using splitBy, the column attribute is the name of the Solr field and the sourceColName attribute is the name of the database column.

  • If not using splitBy then the column attribute is the database column name and the name attribute is the Solr field.



来源:https://stackoverflow.com/questions/14512055/splitting-database-column-into-multivalued-solr-field

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!