% Interspeech 2016 conference paper: neural-network predictor of synthetic speech naturalness.
@inproceedings{yoshimura2016hierarchical,
  author    = {Yoshimura, Takenori and Henter, Gustav Eje and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi},
  title     = {A Hierarchical Predictor of Synthetic Speech Naturalness Using Neural Networks},
  booktitle = {Proc. Interspeech},
  year      = {2016},
  month     = sep,
  address   = {San Francisco, CA},
  publisher = {ISCA},
  volume    = {17},
  pages     = {342--346},
  doi       = {10.21437/Interspeech.2016-847},
  keywords  = {speech synthesis, naturalness, neural network, Blizzard Challenge},
  abstract  = {A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.},
}