@inproceedings{lorenzo2017analyzing,
  author    = {Lorenzo-Trueba, Jaime and Henter, Gustav Eje and Takaki, Shinji and Yamagishi, Junichi},
  title     = {Analyzing the Impact of Including Listener Perception Annotations in {RNN}-Based Emotional Speech Synthesis},
  booktitle = {IPSJ SIG Tech. Rep.},
  volume    = {2017-SLP-119},
  number    = {8},
  pages     = {1--2},
  month     = dec,
  year      = {2017},
  publisher = {IPSJ},
  address   = {Tokyo, Japan},
  url       = {https://ipsj.ixsq.nii.ac.jp/ej/?action=repository_uri&item_id=184864},
  abstract  = {This paper investigates simultaneous modeling of multiple emotions in DNN-based expressive speech synthesis, and how to represent the emotional labels, such as emotional class and strength, for this task. Our goal is to answer two questions: First, what is the best way to annotate speech data with multiple emotions? Second, how should the emotional information be represented as labels for supervised DNN training? We evaluate on a large-scale corpus of emotional speech from a professional actress, additionally annotated with perceived emotional labels from crowd-sourced listeners. By comparing DNN-based speech synthesizers that utilize different emotional representations, we assess the impact of these representations and design decisions on human emotion recognition rates.},
}