@inproceedings{lorenzo2017analyzing,
  author    = {Lorenzo-Trueba, Jaime and Henter, Gustav Eje and Takaki, Shinji and Yamagishi, Junichi},
  title     = {Analyzing the Impact of Including Listener Perception Annotations in {RNN}-Based Emotional Speech Synthesis},
  booktitle = {IPSJ SIG Tech. Rep.},
  volume    = {2017-SLP-119},
  number    = {8},
  pages     = {1--2},
  month     = dec,
  year      = {2017},
  publisher = {IPSJ},
  address   = {Tokyo, Japan},
  url       = {https://ipsj.ixsq.nii.ac.jp/ej/?action=repository_uri&item_id=184864},
  abstract  = {This paper investigates simultaneous modeling of multiple emotions in DNN-based expressive speech synthesis, and how to represent the emotional labels, such as emotional class and strength, for this task. Our goal is to answer two questions: First, what is the best way to annotate speech data with multiple emotions? Second, how should the emotional information be represented as labels for supervised DNN training? We evaluate on a large-scale corpus of emotional speech from a professional actress, additionally annotated with perceived emotional labels from crowd-sourced listeners. By comparing DNN-based speech synthesizers that utilize different emotional representations, we assess the impact of these representations and design decisions on human emotion recognition rates.},
}