% Conference paper: TopicOcean (20th IEEE International Conference on Data Mining, ICDM 2020, pp. 1262--1267).
@inproceedings{de60752a4ab24cd488265311f7c06353,
  author    = {Song, Yuanfeng and Tong, Yongxin and Bao, Siqi and Jiang, Di and Wu, Hua and Wong, Raymond Chi Wing},
  title     = {{TopicOcean}: An ever-increasing topic model with meta-learning},
  booktitle = {Proceedings - 20th IEEE International Conference on Data Mining, ICDM 2020},
  series    = {Proceedings - IEEE International Conference on Data Mining, ICDM},
  editor    = {Plant, Claudia and Wang, Haixun and Cuzzocrea, Alfredo and Zaniolo, Carlo and Wu, Xindong},
  pages     = {1262--1267},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2020},
  month     = nov,
  doi       = {10.1109/ICDM50108.2020.00161},
  language  = {English},
  keywords  = {Meta-learning, Text semantics, Topic modeling},
  note      = {Publisher Copyright: {\textcopyright} 2020 IEEE.; 20th IEEE International Conference on Data Mining, ICDM 2020 ; Conference date: 17-11-2020 Through 20-11-2020},
  abstract  = {Topic modeling has been intensively studied and widely applied in both academia and industry in the last decade. In the literature, topic models usually need to be trained from scratch for each individual corpus. Hence, the wisdom of the crowd (i.e., topic models previously trained based upon other corpora) is abandoned. Since a massive amount of in-domain data, considerable computational cost, and human labour are involved in obtaining a high-quality topic model, training from scratch for each new corpus is a huge waste of resources. In this paper, we propose the novel TopicOcean framework, which aims to integrate well-trained topic models and transfer the knowledge of accumulated topics to new corpora in order to improve the quality of their topic models. We first propose a method of constructing the ever-increasing TopicOcean, and then propose a meta-learning mechanism that transfers the meta-level knowledge (i.e., topics) in TopicOcean to the scenario of topic modeling on new corpora. Comprehensive experiments validate that the TopicOcean framework can significantly outperform the state-of-the-art (53.77\% perplexity improvement on a temporal-shift corpus and 29.24\% improvement on a domain-shift corpus). The well-trained high-quality topic models used to construct TopicOcean have been opensourced to promote further research. The well-trained topic models can be accessed at Github (https://github.com/baidu/Familia/blob/master/model/download-model.sh).},
}