IT序号网

solr 自聚类实现

shasha 2021年05月27日 编程语言 281 0

  参考官网:IT虾米网

  最近用到了solr的自聚类功能,先简单介绍如下:

  1、配置文件

    主要配置文件必须配置如下内容:

<!-- Load every jar under contrib/clustering/lib plus the solr-clustering jar from dist.
     Both directories are resolved against solr.install.dir (falls back to ../../..). -->
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" /> 
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />

    

<!-- Clustering search component. It is toggled by the solr.clustering.enabled property
     (true when the property is not set) and declares three engines: lingo, stc and kmeans.
     All three set carrot.resourcesDir to clustering/carrot2. Only stc is flagged optional;
     the flag is commented out for lingo and kmeans. -->
<searchComponent name="clustering" enable="${solr.clustering.enabled:true}" class="solr.clustering.ClusteringComponent"> 
    <!-- Lingo clustering algorithm --> 
    <lst name="engine"> 
      <str name="name">lingo</str> 
      <!--<bool name="optional">true</bool>--> 
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> 
      <str name="carrot.resourcesDir">clustering/carrot2</str> 
    </lst> 
 
    <!-- An example definition for the STC clustering algorithm. --> 
    <lst name="engine"> 
      <str name="name">stc</str> 
      <bool name="optional">true</bool> 
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str> 
      <str name="carrot.resourcesDir">clustering/carrot2</str> 
    </lst> 
 
    <!-- Bisecting k-means clustering algorithm. --> 
    <lst name="engine"> 
      <str name="name">kmeans</str> 
      <!--<bool name="optional">true</bool>--> 
      <str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str> 
      <str name="carrot.resourcesDir">clustering/carrot2</str> 
    </lst> 
  </searchComponent>

    下面的配置文件根据自己的实际情况进行修改:

 <!-- Lazily started search handler exposed at /clustering. Its defaults turn clustering on,
      map the logical Carrot2 fields (title, url, snippet) to the schema fields keyword, id
      and summary, and run an edismax query; the clustering component is appended as the
      last search component. Adjust the field names to your own schema. -->
 <requestHandler name="/clustering" 
                  startup="lazy" 
                  class="solr.SearchHandler"> 
    <lst name="defaults"> 
      <bool name="clustering">true</bool> 
      <bool name="clustering.results">true</bool> 
 
      <!-- Field name with the logical "title" of each document (optional) --> 
      <str name="carrot.title">keyword</str> 
      <!-- Logical field to physical field mapping. --> 
      <str name="carrot.url">id</str> 
      <!-- Field name with the logical "content" of each document (optional) --> 
      <str name="carrot.snippet">summary</str> 
      <!-- Apply highlighter to the title/ content and use this for clustering. --> 
      <bool name="carrot.produceSummary">true</bool> 
      <!-- the maximum number of labels per cluster --> 
      <!--<int name="carrot.numDescriptions">5</int>--> 
      <!-- produce sub clusters --> 
      <bool name="carrot.outputSubClusters">false</bool> 
 
      <!-- Configure any other request handler parameters. We will cluster the 
         top 100 search results so bump up the 'rows' parameter. --> 
      <!--<str name="defType">edismax</str> 
      <str name="qf"> 
        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 
      </str> 
      <str name="q.alt">*:*</str>--> 
      <!-- Active query defaults: edismax over keyword, title and id with the boosts below. --> 
      <str name="defType">edismax</str> 
      <!--<str name="qf"> 
        summary^0.5 category^1.2  id^10.0 
      </str>--> 
      <str name="qf">keyword^0.5 title^1.2  id^10.0</str> 
      <str name="rows">100</str> 
      <str name="fl">*,score</str> 
    </lst> 
 
    <!-- Append clustering at the end of the list of search components. --> 
    <arr name="last-components"> 
      <str>clustering</str> 
    </arr> 
  </requestHandler>

    managed-schema配置文件包含以下内容:

   

 <!-- text_ik: text field type that uses IKAnalyzer for both index-time and query-time analysis. --> 
 <fieldType name="text_ik" class="solr.TextField"> 
    <analyzer type="index" class="org.wltea.analyzer.lucene.IKAnalyzer"/> 
    <analyzer type="query" class="org.wltea.analyzer.lucene.IKAnalyzer"/> 
  </fieldType> 
  <!-- id is the only required field. The /clustering handler defaults reference id (carrot.url),
       keyword (carrot.title), summary (carrot.snippet) and title (qf), so keep those names in
       sync with the handler. All fields are single-valued, indexed and stored; only "text"
       also enables term vectors. --> 
  <field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/> 
  <field name="text" type="text_ik" multiValued="false" indexed="true" stored="true" termVectors ="true"/> 
  <field name="title" type="text_ik" multiValued="false" indexed="true" stored="true" /> 
  <field name="snippet" type="text_ik" multiValued="false" indexed="true" stored="true" /> 
  <field name="keyword" type="text_ik" multiValued="false" indexed="true" stored="true" /> 
  <field name="category" type="text_ik" multiValued="false" indexed="true" stored="true" /> 
  <field name="summary" type="text_ik" multiValued="false" indexed="true" stored="true"/> 
  <field name="path" type="string" multiValued="false" indexed="true" stored="true"/>

    注意:text_ik对应的分词组件,要引用对应的jar包,具体参见:IT虾米网

  2、测试索引的文件

    启动solr服务,在浏览器输入:http://localhost:8983/solr/mycore/clustering?q=*:*&rows=10

    结果如下:

    

  3、java查询代码

import org.apache.solr.client.solrj.SolrClient; 
import org.apache.solr.client.solrj.SolrQuery; 
import org.apache.solr.client.solrj.SolrServerException; 
import org.apache.solr.client.solrj.impl.HttpSolrClient; 
import org.apache.solr.client.solrj.response.Cluster; 
import org.apache.solr.client.solrj.response.QueryResponse; 
import org.apache.solr.client.solrj.response.ClusteringResponse; 
import org.apache.solr.common.SolrDocument; 
 
import java.io.IOException; 
import java.util.List; 
 
/**
 * Minimal SolrJ example: queries the {@code /clustering} request handler of a
 * Solr core and prints each cluster's labels followed by the ids of the
 * documents assigned to it.
 *
 * @author sks (created 2018/1/18)
 */
public class AutoCluster {

    /** Client shared by the static methods; assigned by {@link #Init(String)}. */
    private static SolrClient solr;

    /**
     * Initializes the shared Solr client.
     *
     * @param urlString base URL of the Solr core to query
     */
    public static void Init(String urlString) {
        solr = new HttpSolrClient.Builder(urlString).build();
    }

    /**
     * Entry point: connects to the {@code mycore} core on localhost, prints
     * the clustering result and then closes the client.
     *
     * @param args unused
     * @throws SolrServerException if Solr reports an error while handling the query
     * @throws IOException if communication with Solr fails or the client cannot be closed
     */
    public static void main(String[] args) throws SolrServerException, IOException {
        String urlString = "http://localhost:8983/solr/mycore";

        Init(urlString);
        try {
            getAutoClusterInfo();
        } finally {
            // Close the client on every path so the process can end normally
            // instead of being forced down with System.exit.
            solr.close();
        }
    }

    /**
     * Runs a match-all query against the {@code /clustering} handler and
     * prints, for every returned cluster, its labels and then its document
     * ids (indented by eight spaces).
     *
     * @throws SolrServerException if Solr reports an error while handling the query
     * @throws IOException if communication with Solr fails
     */
    private static void getAutoClusterInfo() throws SolrServerException, IOException {
        SolrQuery params = new SolrQuery();
        // Route the request to the clustering handler rather than the default one.
        params.set("qt", "/clustering");
        // Match all documents; only the first 30 are requested.
        params.setQuery("*:*");
        params.setStart(0);
        params.setRows(30);

        QueryResponse queryResponse = solr.query(params);
        ClusteringResponse clusteringResponse = queryResponse.getClusteringResponse();
        if (clusteringResponse == null) {
            // No clustering section came back (e.g. the component is disabled
            // or not attached to the handler), so there is nothing to print.
            System.err.println("No clustering section in the Solr response.");
            return;
        }

        for (Cluster cluster : clusteringResponse.getClusters()) {
            // Cluster labels
            for (String label : cluster.getLabels()) {
                System.out.println(label);
            }
            // Ids of the documents in this cluster
            for (String docId : cluster.getDocs()) {
                System.out.println("        " + docId);
            }
        }
    }

}

    查询结果如下:

  

    


评论关闭
IT序号网

微信公众号:IT虾米 (左侧二维码扫一扫)欢迎添加!

IntelliJ IDEA 创建 java Maven项目