% Conference paper (KDD 2023) on open-set semi-supervised text classification
% and the Latent Outlier Softening (LOS) framework.
@inproceedings{6ddd0f8115d34b86acf0a5fceaa6661d,
  author    = {Chen, Junfan and Zhang, Richong and Chen, Junchi and Hu, Chunming and Mao, Yongyi},
  title     = {Open-Set Semi-Supervised Text Classification with Latent Outlier Softening},
  booktitle = {{KDD} 2023 - Proceedings of the 29th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining},
  series    = {Proceedings of the {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  pages     = {226--236},
  publisher = {Association for Computing Machinery},
  address   = {United States},
  year      = {2023},
  month     = aug,
  day       = {4},
  doi       = {10.1145/3580305.3599456},
  language  = {English},
  keywords  = {latent variable, semi-supervised learning, text classification},
  abstract  = {Semi-supervised text classification (STC) has been extensively researched and reduces human annotation. However, existing research assuming that unlabeled data only contains in-distribution texts is unrealistic. This paper extends STC to a more practical Open-set Semi-supervised Text Classification (OSTC) setting, which assumes that the unlabeled data contains out-of-distribution (OOD) texts. The main challenge in OSTC is the false positive inference problem caused by inadvertently including OOD texts during training. To address the problem, we first develop baseline models using outlier detectors for hard OOD-data filtering in a pipeline procedure. Furthermore, we propose a Latent Outlier Softening (LOS) framework that integrates semi-supervised training and outlier detection within probabilistic latent variable modeling. LOS softens the OOD impacts by the Expectation-Maximization (EM) algorithm and weighted entropy maximization. Experiments on 3 created datasets show that LOS significantly outperforms baselines.},
  note      = {Publisher Copyright: {\textcopyright} 2023 ACM.; 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD 2023 ; Conference date: 06-08-2023 Through 10-08-2023},
}