您好,感谢你的开源工作。
我试着在spark-shell中加载portable包“hanlp-portable-1.7.8.jar”,并且将data放到hdfs上,配置了hanlp.properties和ioadapter。
本地进行依存句法是可以执行的,如下述代码。
import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.corpus.io.IIOAdapter
import java.io._;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.hankcs.hanlp.utility.Predefine
import java.net.URI;
// Point HanLP at a custom hanlp.properties before any HanLP class is loaded.
// NOTE(review): this looks like a local filesystem path — it must be readable on
// every JVM that initializes HanLP (driver AND each Spark executor); confirm it
// is present on the executor nodes or distributed with the job.
Predefine.HANLP_PROPERTIES_PATH = "/data1/XXXX/file/upload/hanlp.properties"
// IIOAdapter implementation that resolves HanLP resource paths through the
// Hadoop FileSystem API, so HanLP can load its data files from HDFS.
// Marked Serializable so an instance can be captured by a Spark task closure;
// note that the HanLP.Config.IOAdapter assignment itself is driver-side state
// and must still be re-executed on each executor JVM before HanLP's static
// initializers run there.
class HadoopFileIoAdapter extends IIOAdapter with Serializable {

  // `override` is the Scala modifier that gets compile-time checking; the Java
  // @Override annotation used originally is accepted but has no effect in Scala.
  override def open(path: String): java.io.InputStream = {
    val conf = new Configuration()
    // FileSystem.get returns a cached, shared instance keyed by (scheme,
    // authority, user) — deliberately not closed here, since closing it would
    // invalidate the cached instance for other callers.
    val fs = FileSystem.get(URI.create(path), conf)
    fs.open(new Path(path))
  }

  override def create(path: String): java.io.OutputStream = {
    val conf = new Configuration()
    val fs = FileSystem.get(URI.create(path), conf)
    fs.create(new Path(path))
  }
}
// Install the HDFS-backed IO adapter; this mutates HanLP's static config in
// THIS JVM only (the spark-shell driver).
HanLP.Config.IOAdapter = new HadoopFileIoAdapter();
// Driver-side smoke test: works here because the adapter and properties path
// were configured in this same JVM before HanLP's static initializers ran.
HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")
可是在udf中却无法执行?请问该如何解决呢?
import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.corpus.io.IIOAdapter
import java.io._;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.hankcs.hanlp.utility.Predefine
import java.net.URI;
// Point HanLP at a custom hanlp.properties before any HanLP class is loaded.
// NOTE(review): a driver-local path — executors that initialize HanLP inside a
// UDF need the same file (and the data it references) reachable on their own
// filesystems; verify this, as it is a common cause of executor-side init failure.
Predefine.HANLP_PROPERTIES_PATH = "/data1/XXXX/file/upload/hanlp.properties"
// IIOAdapter implementation that resolves HanLP resource paths through the
// Hadoop FileSystem API, so HanLP can load its data files from HDFS.
// Marked Serializable so an instance can be captured by a Spark task closure;
// the HanLP.Config.IOAdapter assignment itself is still per-JVM static state
// and has to happen on each executor before HanLP classes initialize there.
class HadoopFileIoAdapter extends IIOAdapter with Serializable {

  // `override` is the Scala modifier that gets compile-time checking; the Java
  // @Override annotation used originally is accepted but has no effect in Scala.
  override def open(path: String): java.io.InputStream = {
    val conf = new Configuration()
    // FileSystem.get returns a cached, shared instance keyed by (scheme,
    // authority, user) — deliberately not closed here, since closing it would
    // invalidate the cached instance for other callers.
    val fs = FileSystem.get(URI.create(path), conf)
    fs.open(new Path(path))
  }

  override def create(path: String): java.io.OutputStream = {
    val conf = new Configuration()
    val fs = FileSystem.get(URI.create(path), conf)
    fs.create(new Path(path))
  }
}
// Driver-side configuration: mutates HanLP's static config in the driver JVM only.
HanLP.Config.IOAdapter = new HadoopFileIoAdapter();
// Works on the driver, where the adapter/properties were configured first.
HanLP.parseDependency("徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。")
// UDF wrapping parseDependency. The lambda body runs on EXECUTOR JVMs, where
// neither Predefine.HANLP_PROPERTIES_PATH nor HanLP.Config.IOAdapter was ever
// set — so HanLP's static initialization fails there, which is presumably why
// the NoClassDefFoundError below appears only in the UDF path (TODO confirm:
// the configuration needs to run inside the closure / on each executor, and
// the jar + data must be shipped with the job).
val keyword_hanlp = udf{(kw: String) =>
HanLP.parseDependency(kw).toString
}
// Applying the UDF forces executor-side evaluation; this is where it fails.
val df_test = df_final.withColumn("hanlp", keyword_hanlp($"kw"))
df_test.select("kw", "hanlp").show
报错信息如下:
Caused by: java.lang.NoClassDefFoundError: Could not initialize class com.hankcs.hanlp.tokenizer.NLPTokenizer
期待你的回复。谢谢