@inproceedings{malisz2019ukspeech,
  author    = {Malisz, Zofia and Henter, Gustav Eje and Valentini-Botinhao, Cassia and Watts, Oliver and Beskow, Jonas and Gustafson, Joakim},
  title     = {Modern Speech Synthesis and Its Implications for Speech Sciences},
  booktitle = {Proc. UK Speech},
  address   = {Birmingham, UK},
  month     = jun,
  year      = {2019},
  volume    = {8},
  pages     = {11--12},
  url       = {http://www.thespeechark.com/ukspeech2019/abstractBook\_UKSpeech2019\_fin\_noLogos.pdf},
  abstract  = {Speech technology (e.g., speech synthesis) and speech sciences (e.g., phonetics) depend on an ongoing dialogue that benefits both fields. Insights into speech production, like source-filter separation, and perception, like the mel scale, were for example central in the development of classical formant-based synthesis technology and remain important also today. Speech sciences have also contributed towards advanced synthetic-speech evaluation methods. In return, milestones in phonetics such as evidence for categorical perception as well as advances like the motor theory of speech perception and acoustic cue analysis have relied on support from experiments on synthesised speech. However, in recent decades the two fields have grown apart: Speech technologists have primarily pursued increasingly natural-sounding synthesis, relinquishing precise output control in the process. Speech scientists and phoneticians, meanwhile, have remained reliant on legacy synthesisers, since only these provide the careful output control necessary for phonetic studies. Unfortunately, a body of research has over the years identified substantial perceptual differences between natural speech and classical formant synthesis, casting doubt on speech-science findings from synthetic speech. Recently, breakthroughs in deep learning have fuelled a rapid acceleration of speech-technology capabilities. In this work, we argue that modern speech synthesis with deep learning in fact has the potential to address both of the two key concerns of speech scientists -- control and realism -- by 1) bringing back precise control over synthetic-speech output and 2) significantly closing the perceptual gap between natural and synthetic speech. Both claims find support in recent research in speech-synthesis technology. We supplement our two claims with an empirical evaluation contrasting classic rule-based formant synthesis (OVE III) against state-of-the-art synthesis methods, specifically speech-in-speech-out copy synthesis (MagPhase and Griffin-Lim), DNN-based statistical parametric text-to-speech (Merlin), and sequence-to-sequence neural TTS (DCTTS). The systems are compared in terms of subjective naturalness ratings as well as on a behavioural measure (response times in a lexical decision task). We find that all modern methods vastly improve on formant synthesis naturalness and are rated above OVE III at least 99\% of the time. Moreover, response times for copy-synthesis and Merlin are found not to differ notably from response times to natural speech, meaning that the troubling processing gap of older systems (including OVE III) is no longer evident. In light of these findings and the parallel advances in synthesis control, the time is ripe for phonetics researchers to consider what modern speech-synthesis technology can do for their research problems.},
}