package cn.com.duiba.nezha.compute.common.util

import java.util


import cn.com.duiba.nezha.compute.common.dict.CategoryFeatureDict

import scala.collection.Map
import org.apache.spark.rdd.RDD
import scala.collection.JavaConverters._

/**
 * Helpers that derive categorical-feature vocabularies from rows stored in an RDD.
 *
 * Every row is a `List[String]`; a feature is addressed by its position in that list.
 * Values are lower-cased before being de-duplicated, and null cells are ignored.
 *
 * Created by pc on 2016/12/15.
 */
object CategoryFeatureUtil {

  /**
   * Extracts column `idx` from every row, drops null cells and lower-cases the rest.
   *
   * Nulls are removed before lower-casing so that a null cell cannot raise a
   * NullPointerException inside the Spark task. A row that has no element at
   * `idx` still fails, because `List.apply` throws for an out-of-range index.
   *
   * @param data rows to read
   * @param idx  zero-based position of the column inside each row
   * @return the non-null, lower-cased values of the column (duplicates kept)
   */
  private def normalizedColumn(data: RDD[List[String]], idx: Int): RDD[String] =
    data.map(line => line(idx)).filter(_ != null).map(_.toLowerCase)

  /**
   * De-duplicates the values of one column and assigns an index to each distinct value.
   *
   * The index comes from `RDD.zipWithIndex`, so it follows the order of the
   * de-duplicated RDD rather than any sorted order of the values.
   *
   * @param data rows to read
   * @param idx  zero-based position of the column inside each row
   * @return map from lower-cased distinct value to its index
   */
  def getFeatureMapping(data: RDD[List[String]], idx: Int): Map[String, Long] =
    normalizedColumn(data, idx).distinct().zipWithIndex().collectAsMap()

  /**
   * Collects the distinct values of one column to the driver.
   *
   * @param data rows to read
   * @param idx  zero-based position of the column inside each row
   * @return the lower-cased distinct values in ascending order
   */
  def getFeature(data: RDD[List[String]], idx: Int): Seq[String] =
    // Only the values are needed, so no index is generated and nothing is cached:
    // the intermediate RDDs are consumed exactly once.
    normalizedColumn(data, idx).distinct().collect().toSeq.sorted

  /**
   * Collects the distinct tokens of a multi-valued column to the driver.
   *
   * Each cell is lower-cased and then split with `String.split(sep)`, which
   * interprets `sep` as a regular expression; callers that want a literal
   * separator such as `|` or `.` must escape it.
   *
   * @param data rows to read
   * @param idx  zero-based position of the column inside each row
   * @param sep  separator, treated as a regular expression
   * @return the distinct tokens in ascending order
   */
  def getFeatureWithSep(data: RDD[List[String]], idx: Int, sep: String): Seq[String] =
    normalizedColumn(data, idx)
      .flatMap(w => w.split(sep))
      .distinct()
      .collect()
      .toSeq
      .sorted

  /**
   * Builds a [[CategoryFeatureDict]] holding the sorted distinct values of every requested feature.
   *
   * One Spark job is run per feature (through [[getFeature]]), so `data` is
   * re-evaluated for each entry of `featureIdxList` unless the caller has persisted it.
   *
   * @param data             rows to read
   * @param featureIdxList   identifiers of the features to include in the dictionary
   * @param featuerIdxLocMap maps a feature identifier to the column position of that feature
   * @return a dictionary whose feature map has one entry per identifier in `featureIdxList`
   * @throws NoSuchElementException if an identifier is missing from `featuerIdxLocMap`
   */
  def getFeatureDict(data: RDD[List[String]], featureIdxList: List[String], featuerIdxLocMap: Map[String, Int]): CategoryFeatureDict = {
    val dictMap = new util.HashMap[String, java.util.List[String]]

    // foreach, not map: the loop exists only for its side effect on dictMap.
    featureIdxList.foreach { featureIdx =>
      val idx = featuerIdxLocMap.getOrElse(
        featureIdx,
        throw new NoSuchElementException(s"no column location registered for feature '$featureIdx'")
      )
      dictMap.put(featureIdx, getFeature(data, idx).asJava)
    }

    val dict: CategoryFeatureDict = new CategoryFeatureDict()
    dict.setFeatureDict(dictMap)
    dict
  }

}