% Conference paper (booktitle: Proc. ICASSP, 2017) on adapting and controlling
% DNN-based speech synthesis via speaker/gender/age input codes.
%
% Conventions used in this entry:
%   - `month` is the predefined bare macro `mar` (not a literal string), so the
%     bibliography style decides how the month is abbreviated and localised.
%   - Acronyms ({DNN}, {ICASSP}) are brace-protected against style re-casing.
%   - `abstract` and `keywords` are ignored by the standard BibTeX styles; they
%     are kept as metadata for tools that read them.
%
% NOTE(review): `volume = {42}` may be the ordinal of the conference rather
% than a proceedings volume number -- confirm against the publisher record
% before relying on it in printed output.
% NOTE(review): `address` holds a city that is presumably the conference venue
% rather than the publisher's location -- confirm what the target style expects.
@inproceedings{luong2017adapting,
  author    = {Luong, Hieu-Thi and Takaki, Shinji and Henter, Gustav Eje and Yamagishi, Junichi},
  title     = {Adapting and Controlling {DNN}-based Speech Synthesis Using Input Codes},
  booktitle = {Proc. {ICASSP}},
  year      = {2017},
  month     = mar,
  address   = {New Orleans, LA},
  publisher = {IEEE},
  volume    = {42},
  pages     = {4905--4909},
  doi       = {10.1109/ICASSP.2017.7953089},
  keywords  = {speech synthesis, DNNs, speaker adaptation, speech manipulation, voice morphing},
  abstract  = {Methods for adapting and controlling the characteristics of output speech are important topics in speech synthesis. In this work, we investigated the performance of DNN-based text-to-speech systems that in parallel to conventional text input also take speaker, gender, and age codes as inputs, in order to 1) perform multi-speaker synthesis, 2) perform speaker adaptation using small amounts of target-speaker adaptation data, and 3) modify synthetic speech characteristics based on the input codes. Using a large-scale, studio-quality speech corpus with 135 speakers of both genders and ages between tens and eighties, we performed three experiments: 1) First, we used a subset of speakers to construct a DNN-based, multi-speaker acoustic model with speaker codes. 2) Next, we performed speaker adaptation by estimating code vectors for new speakers via backpropagation from a small amount of adaptation material. 3) Finally, we experimented with manually manipulating input code vectors to alter the gender and/or age characteristics of the synthesised speech. Experimental results show that high-performance multi-speaker models can be constructed using the proposed code vectors with a variety of encoding schemes, and that adaptation and manipulation can be performed effectively using the codes.},
}