@inproceedings{watts2019where,
  title={Where do the improvements come from in sequence-to-sequence neural {TTS}?},
  author={Watts, Oliver and Henter, Gustav Eje and Fong, Jason and Valentini-Botinhao, Cassia},
  booktitle={Proc. SSW},
  abstract={Sequence-to-sequence neural networks with attention mechanisms have recently been widely adopted for text-to-speech. Compared with older, more modular statistical parametric synthesis systems, sequence-to-sequence systems feature three prominent innovations: 1) They replace substantial parts of traditional fixed front-end processing pipelines (like Festival's) with learned text analysis; 2) They jointly learn to align text and speech and to synthesise speech audio from text; 3) They operate autoregressively on previously-generated acoustics. Naturalness improvements have been reported relative to earlier systems which do not contain these innovations. It would be useful to know how much each of the various innovations contributes to the improved performance. We here propose one way of associating the separately-learned components of a representative older modular system, specifically Merlin, with the different sub-networks within recent neural sequence-to-sequence architectures, specifically Tacotron 2 and DCTTS. This allows us to swap various components and subnets in and out to produce intermediate systems that step between the two paradigms; subjective evaluation of these systems then allows us to isolate the perceptual effects of the various innovations. We report on the design, evaluation, and findings of such an experiment.},
  keywords={speech synthesis, end-to-end, SPSS, naturalness},
  address={Vienna, Austria},
  month={Sept.},
  publisher={ISCA},
  volume={10},
  pages={217--222},
  doi={10.21437/SSW.2019-39},
  year={2019}
}