% Henter, Ronanki, Watts, King (2017): paper listed under "IEICE Tech. Rep.", vol. 116, no. 414.
% The month field uses the standard unquoted macro so the bibliography style controls its rendering.
@inproceedings{henter2017nonparametric,
  author    = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and King, Simon},
  title     = {Non-parametric duration modelling for speech synthesis with a joint model of acoustics and duration},
  booktitle = {IEICE Tech. Rep.},
  volume    = {116},
  number    = {414},
  pages     = {11--16},
  address   = {Tokyo, Japan},
  publisher = {IEICE},
  month     = jan,
  year      = {2017},
  url       = {https://www.ieice.org/ken/paper/20170121lbp6/},
  keywords  = {text-to-speech, speech synthesis, duration modelling, non-parametric models, LSTMs},
  abstract  = {We describe a new approach to duration modelling for statistical parametric speech synthesis, in which a statistical model is trained to output a phone transition probability at each time unit. Unlike conventional duration modelling -- which assumes that duration distributions have a particular shape and use the mean of that distribution for synthesis -- our approach can in principle model any distribution supported on the positive integers. Generation from this model can be performed in many ways; here we consider output generation based on the median or other quantiles of the predicted duration. Compared to conventional mean durations, the median is more typical (more probable), is robust to training-data irregularities, and enables incremental generation. Furthermore, our approach is consistent with a longer-term goal of modelling durations and acoustic features together. Results indicate that the proposed method is competitive with baseline approaches in approximating the median duration of held-out natural speech. We also discuss extensions that allow iterative realignment and adjusting the global speech rate.},
}