Problem using dependency parsing in spark-shell

Hello, and thank you for your open-source work.
I tried loading the portable jar 'hanlp-portable-1.7.8.jar' in spark-shell, put the data directory on HDFS, and configured hanlp.properties and an IOAdapter.
Dependency parsing runs fine locally on the driver, as the following code shows.

import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.corpus.io.IIOAdapter
import com.hankcs.hanlp.utility.Predefine
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import java.net.URI

// Tell HanLP where to find its properties file before any HanLP class is initialized
Predefine.HANLP_PROPERTIES_PATH = "/data1/XXXX/file/upload/hanlp.properties"

// IOAdapter that lets HanLP read and write its data files on HDFS instead of the local file system
class HadoopFileIoAdapter extends IIOAdapter {
    override def open(path: String): java.io.InputStream = {
        val conf: Configuration = new Configuration()
        val fs: FileSystem = FileSystem.get(URI.create(path), conf)
        fs.open(new Path(path))
    }

    override def create(path: String): java.io.OutputStream = {
        val conf: Configuration = new Configuration()
        val fs: FileSystem = FileSystem.get(URI.create(path), conf)
        fs.create(new Path(path))
    }
}

HanLP.Config.IOAdapter = new HadoopFileIoAdapter()
HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")
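
For reference, parseDependency returns a CoNLLSentence; a minimal sketch of how the result can be walked locally (this loop is only my own illustration, assuming the standard CoNLLSentence.getWordArray / CoNLLWord API, and is not part of the script above):

val sentence = HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")
for (word <- sentence.getWordArray) {
    // each CoNLLWord carries the token (LEMMA), its dependency relation (DEPREL) and its head word
    println(s"${word.LEMMA} --(${word.DEPREL})--> ${word.HEAD.LEMMA}")
}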

However, the same call fails when it is wrapped in a UDF. How can this be fixed?

(Exactly the same setup as above: imports, properties path, HadoopFileIoAdapter, IOAdapter assignment, and the local parseDependency call, followed by:)

// UDF that runs HanLP dependency parsing on each value of the kw column
val keyword_hanlp = udf { (kw: String) =>
  HanLP.parseDependency(kw).toString
}

val df_test = df_final.withColumn("hanlp", keyword_hanlp($"kw"))
df_test.select("kw", "hanlp").show

The error message is as follows:

Caused by: java.lang.NoClassDefFoundError: Could not initialize class com.hankcs.hanlp.tokenizer.NLPTokenizer
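
My own guess, which I have not verified: the assignments to Predefine.HANLP_PROPERTIES_PATH and HanLP.Config.IOAdapter only run in the driver JVM, so when the UDF first touches HanLP on an executor, the static initialization of NLPTokenizer has no properties path or IOAdapter in place and the class fails to initialize. The sketch below is the workaround I am considering; HanLPOnExecutor is a name I made up, and it assumes hanlp.properties is readable at the same local path on every executor node:

// Hypothetical workaround sketch: perform HanLP's configuration lazily on each
// executor JVM before the first parse, instead of only on the driver.
object HanLPOnExecutor {
    // a lazy val body runs at most once per JVM, i.e. once per executor
    lazy val configured: Boolean = {
        Predefine.HANLP_PROPERTIES_PATH = "/data1/XXXX/file/upload/hanlp.properties"
        HanLP.Config.IOAdapter = new HadoopFileIoAdapter()
        true
    }

    def parse(text: String): String = {
        configured  // force the one-time per-JVM configuration before parsing
        HanLP.parseDependency(text).toString
    }
}

// redefine the UDF so every task goes through the per-executor initializer
val keyword_hanlp = udf { (kw: String) => HanLPOnExecutor.parse(kw) }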

Looking forward to your reply. Thank you.

Did you ever find a solution? I am hitting the same problem when running a distributed batch job on Spark.