# 应用HanLP从企业经营范围到行业代码建模之一 开题

《应用HanLP从企业经营范围到行业代码建模之一 开题》

1 Like

1、我还没有私有领域的专有名词标注数据，这是需要相当大人力资源的工作，或者要外包给专业的数据标注公司进行，也是需要数据资源和资金资源的工作。
2、还没有微调训练过深度学习预训练NLP模型，虽然HanLP上有教程。

``````python
# 加载数据 ---------------------------------------------------------------------------------
import pyarrow as pa
import pyarrow.parquet as pq
import time, os, gc
import json
import psutil
import objgraph
import numpy as np
import pandas as pd
import re
from igraph import *
import matplotlib.pyplot as plt

parquet_file_path = 'D:/temp/data/工商登记/gsdj_parsed.parquet'

# Load the Parquet file into a pandas DataFrame that later steps can use as
# globally referenced data, and report how long the load took.
t1 = time.time()
# NOTE(review): the statement that created `table` is missing from this
# listing, so the script failed with NameError on the next line. Reading the
# whole file is the minimal reconstruction; if only a few columns are needed,
# pass columns=[...] to keep memory down -- confirm against the original.
table = pq.read_table(parquet_file_path)
df = table.to_pandas()
# Drop the Arrow table once it has been converted so both copies are not
# kept alive at the same time.
del table
gc.collect()
t2 = time.time()
print(t2-t1)

# Compute and get the first 100 rows as a pandas DataFrame

# Running totals shared by the whole analysis. Each inner dict maps a token
# (or a POS tag, for 'posCount') to the number of times it has been seen.
cumulative_results = {
    'wordsCount': {},
    'posCount': {},
    'action': {},
    'target': {},
}

# Number of rows requested per Parquet record batch; tune it to the dataset
# size and the memory available on the machine.
batch_size = 100_000

# Function to update cumulative results based on parsed JSON and possibly using global references
def update_cumulative_results(parsed_data, i, j):
    """Feed one chunk of parsed records into the cumulative statistics.

    Args:
        parsed_data: Sequence of parsed records belonging to this chunk.
        i: Index of the enclosing batch; multiplied by the module-level
            ``batch_size`` to get the batch's first global row.
        j: Offset of this chunk's first record inside the batch.
    """
    # Global row number of the first record in this chunk.
    first_row = i * batch_size + j
    last_row_exclusive = first_row + len(parsed_data)
    print(f'Processing {first_row} to {last_row_exclusive-1}.')

    # Hand each record to the analysis step together with its global row number.
    for offset, record in enumerate(parsed_data):
        myanalysis(record, first_row + offset)

# print(df1.iloc[0,2])
#dic = result2dic(df1.iloc[0,2])
#dic = result2dic(df1.iloc[3,2])

# Do some analysis
def myanalysis(dic, index):
    """Fold one parsed record's tokens and POS tags into ``cumulative_results``.

    Every token is counted under ``'wordsCount'`` and every tag under
    ``'posCount'``. A token whose tag contains ``'V'`` is also counted under
    ``'action'``, and one whose tag contains ``'N'`` under ``'target'``.

    Args:
        dic: Parsed result for one record. It is read through ``dic['tok']``
            and ``dic['pos']``, which are iterated in lockstep.
        index: Global row number of the record. Not used by the counting
            itself; kept so callers can keep passing it.

    Records that lack ``'tok'``/``'pos'`` or that are not subscriptable
    (for example ``None`` for an empty business scope) are skipped.
    """
    wordsCount = cumulative_results['wordsCount']
    posCount = cumulative_results['posCount']
    actions = cumulative_results['action']
    targets = cumulative_results['target']
    # Some business scopes are empty or could not be parsed. Skip only the
    # failures that situation produces (missing key, non-subscriptable or
    # non-iterable value) so that unrelated bugs are not silently hidden.
    try:
        words = dic['tok']
        poss = dic['pos']
        for word, pos in zip(words, poss):
            wordsCount[word] = wordsCount.get(word, 0) + 1
            posCount[pos] = posCount.get(pos, 0) + 1
            # Business activities are verbs: collect which ones occur.
            if 'V' in pos:
                actions[word] = actions.get(word, 0) + 1
            # Business objects are nouns: collect which ones occur.
            if 'N' in pos:
                targets[word] = targets.get(word, 0) + 1
    except (KeyError, TypeError):
        pass

# Stream the Parquet file in record batches, reading only the column that
# holds the JSON parse results, and fold every record into the cumulative
# statistics. Elapsed time is measured from the start of the whole pass.
t1 = time.time()
parquet_file = pq.ParquetFile(parquet_file_path)

# Number of JSON strings parsed at once inside a batch; adjust it to the
# complexity of the JSON and the available memory. Defined once, outside the
# loops, and reused for the progress message so the two cannot drift apart.
chunk_size = 10000

for i, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size, columns=['parse_results'])):
    table = pa.Table.from_batches([batch])
    json_strs = table.column('parse_results').to_pylist()

    # Parse and analyse the batch in smaller chunks.
    for j in range(0, len(json_strs), chunk_size):
        chunk = json_strs[j:j + chunk_size]
        # NOTE(review): result2dic is not defined in the visible part of the
        # file; it is presumably the JSON-string-to-dict parser -- confirm.
        parsed_chunk = [result2dic(item) for item in chunk]

        # Update cumulative results with data from this chunk.
        update_cumulative_results(parsed_chunk, i, j)

        # Release the chunk before the next one is built.
        del chunk, parsed_chunk
        t2 = time.time()
        print(f'Batch {i} chunk {j // chunk_size} processed, {np.round(t2-t1)} seconds.')

    # Release the batch-level objects before the next batch is read.
    del table, json_strs
    gc.collect()

def _frequency_table(counts, label):
    """Return a frame with columns (label, 'count'), highest count first."""
    frame = pd.DataFrame({label: counts.keys(), 'count': counts.values()})
    frame = frame.sort_values(by=['count'], ascending=False)
    return frame.reset_index(drop=True)


# Verbs (business activities), most frequent first.
actions = _frequency_table(cumulative_results['action'], 'action')

# Nouns (business objects), most frequent first.
targets = _frequency_table(cumulative_results['target'], 'target')

# Part-of-speech tags, most frequent first.
poss = _frequency_table(cumulative_results['posCount'], 'posCount')

# Configure matplotlib for Chinese text.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so Chinese labels render
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly

# Bar chart of POS-tag frequencies. The size is passed to plt.subplots so it
# applies to the figure that is actually drawn; a separate plt.figure(...)
# call would only open an extra, empty figure. figsize is (width, height).
fig, ax = plt.subplots(figsize=(12, 16))
ax.bar(list(poss['posCount']), list(poss['count']))
ax.set_ylabel('词频')
ax.set_title('词性标注统计')
# Rotate the tag labels so they do not overlap along the x axis.
plt.xticks(rotation=90)
plt.show()

``````