% Conference paper entry (booktitle: Proc. ICASSP, 2017).
% `month` uses the standard BibTeX month macro so the style decides how it is rendered.
% NOTE(review): `address` appears to hold the conference venue rather than the
% publisher's city, and `volume` may be the conference edition number -- confirm
% against this file's conventions before changing either.
@inproceedings{luong2017adapting,
  author    = {Luong, Hieu-Thi and Takaki, Shinji and Henter, Gustav Eje and Yamagishi, Junichi},
  title     = {Adapting and Controlling {DNN}-based Speech Synthesis Using Input Codes},
  booktitle = {Proc. ICASSP},
  year      = {2017},
  month     = mar,
  address   = {New Orleans, LA},
  publisher = {IEEE},
  volume    = {42},
  pages     = {4905--4909},
  doi       = {10.1109/ICASSP.2017.7953089},
  keywords  = {speech synthesis, DNNs, speaker adaptation, speech manipulation, voice morphing},
  abstract  = {Methods for adapting and controlling the characteristics of output speech are important topics in speech synthesis. In this work, we investigated the performance of DNN-based text-to-speech systems that in parallel to conventional text input also take speaker, gender, and age codes as inputs, in order to 1) perform multi-speaker synthesis, 2) perform speaker adaptation using small amounts of target-speaker adaptation data, and 3) modify synthetic speech characteristics based on the input codes. Using a large-scale, studio-quality speech corpus with 135 speakers of both genders and ages between tens and eighties, we performed three experiments: 1) First, we used a subset of speakers to construct a DNN-based, multi-speaker acoustic model with speaker codes. 2) Next, we performed speaker adaptation by estimating code vectors for new speakers via backpropagation from a small amount of adaptation material. 3) Finally, we experimented with manually manipulating input code vectors to alter the gender and/or age characteristics of the synthesised speech. Experimental results show that high-performance multi-speaker models can be constructed using the proposed code vectors with a variety of encoding schemes, and that adaptation and manipulation can be performed effectively using the codes.}
}