This page lists all of my publications. Alternatively, you can visit my profiles on Google Scholar, Semantic Scholar, and Tilburg University. Since the field of computational linguistics is conference-driven, most of my work appears in conference proceedings, which are available through the ACL Anthology.
For my recent book on content analysis, see this page.
2025
van Miltenburg, Emiel
Werkboek Inhoudsanalyse Book
Open Press Tilburg University, 2025, ISBN: 9789403769509.
@book{werkboek_inhoudsanalyse,
title = {Werkboek Inhoudsanalyse},
author = {Emiel van Miltenburg},
url = {https://tiu.trialanderror.org/projects/werkboek-inhoudsanalyse},
isbn = {9789403769509},
year = {2025},
date = {2025-05-15},
publisher = {Open Press Tilburg University},
abstract = {Werkboek Inhoudsanalyse is een praktische handleiding om te leren hoe je op een betrouwbare en verantwoordelijke manier kunt analyseren hoe mensen met elkaar communiceren. Dit boek richt zich op het ontwerpen en uitvoeren van studies die gebruik maken van kwantitatieve inhoudsanalyse als onderzoeksmethode. Deze eerste editie is ontwikkeld in het kader van de opleiding Communicatie- en Informatiewetenschappen (CIW) van Tilburg University. Binnen één semester schrijven studenten een onderzoeksvoorstel en voeren ze het voorstel vervolgens ook uit. Deze groepsopdracht is volledig uitgewerkt in de bijlagen van het boek.},
keywords = {},
pubstate = {published},
tppubtype = {book}
}
Miltenburg, Emiel
Wat zou een grotere prioriteit moeten krijgen om Large Language Models (LLM’s) zoals ChatGPT veilig en betrouwbaar verder te ontwikkelen? Miscellaneous
2025, (This article contains my response to a question submitted by a member of the public. The AI helpdesk contacted me to answer this question.).
@misc{2b10dce55be94ce2855e2c908d119e27,
title = {Wat zou een grotere prioriteit moeten krijgen om Large Language Models (LLM’s) zoals ChatGPT veilig en betrouwbaar verder te ontwikkelen?},
author = {Emiel Miltenburg},
url = {https://ikhebeenvraagoverai.nl/answers/wat-zou-een-grotere-prioriteit-moeten-krijgen-om-large-language-models-llms-zoals-chatgpt-veilig-en-betrouwbaar-verder-te-ontwikkelen/},
year = {2025},
date = {2025-02-11},
urldate = {2025-02-11},
publisher = {Utrecht University},
abstract = {De volledige vraag is: Wat zou een grotere prioriteit moeten krijgen om Large Language Models (LLM’s) zoals ChatGPT veilig en betrouwbaar verder te ontwikkelen: Meer onderzoek naar het ontstaan en de betekenis van hallucinaties en confabulaties door LLM’s? Of meer onderzoek naar methoden en nieuwe technologie om het ontstaan van confabulaties en hallucinaties in LLM’s volledig tegen te gaan? Deze vraag is gebaseerd op vijf onzekere aannames: 1. Dat hallucinaties schadelijk zijn. Dit is context-afhankelijk. 2. Dat hallucinaties door grote taalmodellen volledig te vermijden zijn. Dat is een onhaalbaar ideaal. 3. Dat de twee oplossingen (beter begrijpen hoe hallucinaties ontstaan en het ontwikkelen van technologie om dit tegen te gaan) onafhankelijk van elkaar zijn. Maar zonder begrip van het probleem los je niets op. 4. Dat er een objectieve keuze te maken valt tussen de twee genoemde oplossingen. 5. Dat grote taalmodellen een onvermijdelijke technologische ontwikkeling zijn. Verschillende onderzoekers stellen dat deze taalmodellen onethisch, fundamenteel beperkt, of allebei zijn.},
note = {This article contains my response to a question submitted by a member of the public. The AI helpdesk contacted me to answer this question.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Miltenburg, Emiel
Dual use issues in the field of Natural Language Generation Miscellaneous
2025.
@misc{c2d7a18caa5448c997e8011c00b124d4,
title = {Dual use issues in the field of Natural Language Generation},
author = {Emiel Miltenburg},
url = {https://arxiv.org/abs/2501.06636},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-01},
abstract = {This report documents the results of a recent survey in the SIGGEN community, focusing on Dual Use issues in Natural Language Generation (NLG). SIGGEN is the Special Interest Group (SIG) of the Association for Computational Linguistics (ACL) for researchers working on NLG. The survey was prompted by the ACL executive board, which asked all SIGs to provide an overview of dual use issues within their respective subfields. The survey was sent out in October 2024 and the results were processed in January 2025. With 23 respondents, the survey is presumably not representative of all SIGGEN members, but at least this document offers a helpful resource for future discussions.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2024
Liebrecht, Christine; Miltenburg, Emiel; Hooijdonk, Charlotte; Kunneman, Florian; Merckens, Anouk; Niessen, Nik
Hoe halen chatbots de kink uit de kabel?: Reparatiestrategieën bij onbegrip in een chatbotgesprek Journal Article
In: Tijdschrift voor Communicatiewetenschap, vol. 52, no. 3, pp. 288–325, 2024, ISSN: 1384-6930, (Publisher Copyright: © Christine Liebrecht, Emiel van Miltenburg, Charlotte van Hooijdonk, Florian Kunneman, Anouk Merckens & Nik Niessen.).
@article{f24a72452638457c8d4234be1d6ff2e8,
title = {Hoe halen chatbots de kink uit de kabel?: Reparatiestrategieën bij onbegrip in een chatbotgesprek},
author = {Christine Liebrecht and Emiel Miltenburg and Charlotte Hooijdonk and Florian Kunneman and Anouk Merckens and Nik Niessen},
doi = {10.5117/TCW2024.3.003.LIEB},
issn = {1384-6930},
year = {2024},
date = {2024-07-01},
journal = {Tijdschrift voor Communicatiewetenschap},
volume = {52},
number = {3},
pages = {288–325},
publisher = {Uitgeverij Boom},
abstract = {Chatbots worden steeds vaker ingezet in de klantenservice, maar zijn verre van foutloos. Wanneer chatbots fouten maken, zijn er verschillende reparatiestrategieën om het onbegrip te communiceren. Dit artikel geeft een overzicht van de literatuur over dit onderwerp, en presenteert twee experimentele studies waaruit blijkt dat chatbots onbegrip beter met een tegemoetkomende reparatiestrategie kunnen communiceren dan met een defensieve strategie.},
note = {Publisher Copyright: © Christine Liebrecht, Emiel van Miltenburg, Charlotte van Hooijdonk, Florian Kunneman, Anouk Merckens & Nik Niessen.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Miltenburg, Emiel; Braggaar, Anouck; Braun, Nadine; Goudbeek, Martijn; Krahmer, Emiel; Lee, Chris; Pauws, Steffen; Tomas, Frédéric
ReproHum: 0033-03: How Reproducible Are Fluency Ratings of Generated Text? A Reproduction of August et al. 2022 Proceedings Article
In: Balloccu, Simone; Belz, Anya; Huidrom, Rudali; Reiter, Ehud; Sedoc, Joao; Thomson, Craig (Ed.): Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024, pp. 132–144, ELRA and ICCL, 2024.
@inproceedings{9101c41e10c246f58319d02c5828e511,
title = {ReproHum: 0033-03: How Reproducible Are Fluency Ratings of Generated Text? A Reproduction of August et al. 2022},
author = {Emiel Miltenburg and Anouck Braggaar and Nadine Braun and Martijn Goudbeek and Emiel Krahmer and Chris Lee and Steffen Pauws and Frédéric Tomas},
editor = {Simone Balloccu and Anya Belz and Rudali Huidrom and Ehud Reiter and Joao Sedoc and Craig Thomson},
url = {https://aclanthology.org/2024.humeval-1.13/},
year = {2024},
date = {2024-05-00},
urldate = {2024-05-00},
booktitle = {Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024},
pages = {132–144},
publisher = {ELRA and ICCL},
abstract = {In earlier work, August et al. (2022) evaluated three different Natural Language Generation systems on their ability to generate fluent, relevant, and factual scientific definitions. As part of the ReproHum project (Belz et al., 2023), we carried out a partial reproduction study of their human evaluation procedure, focusing on human fluency ratings. Following the standardised ReproHum procedure, our reproduction study follows the original study as closely as possible, with two raters providing 300 ratings each. In addition to this, we carried out a second study where we collected ratings from eight additional raters and analysed the variability of the ratings. We successfully reproduced the inferential statistics from the original study (i.e. the same hypotheses were supported), albeit with a lower inter-annotator agreement. The remainder of our paper shows significant variation between different raters, raising questions about what it really means to reproduce human evaluation studies.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
Wat als Greta Monach Amerikaans was geweest?: Over keuzes, toeval, en de reproduceerbaarheid van digitale poëzie Journal Article
In: Neerlandistiek.nl, 2024, ISSN: 1567-6633.
@article{be541d7424b242d89af625fa48a17a11,
title = {Wat als Greta Monach Amerikaans was geweest?: Over keuzes, toeval, en de reproduceerbaarheid van digitale poëzie},
author = {Emiel Miltenburg},
url = {https://neerlandistiek.nl/2024/02/wat-als-greta-monach-amerikaans-was-geweest/},
issn = {1567-6633},
year = {2024},
date = {2024-02-19},
urldate = {2024-02-19},
journal = {Neerlandistiek.nl},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Miltenburg, Emiel
Willekeurige gedichten Journal Article
In: Neerlandistiek.nl, 2024, ISSN: 1567-6633.
@article{8d692b69ae83473193492749d25c885e,
title = {Willekeurige gedichten},
author = {Emiel Miltenburg},
url = {https://neerlandistiek.nl/2024/01/willekeurige-gedichten/},
issn = {1567-6633},
year = {2024},
date = {2024-01-03},
urldate = {2024-01-03},
journal = {Neerlandistiek.nl},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Braggaar, Anouck; Kunneman, Florian; Miltenburg, Emiel
Analyzing Patterns of Conversational Breakdown in Human-Chatbot Customer Service Conversations Proceedings Article
In: 2024.
@inproceedings{3534f3298ece4d998d8bf41d150fd1ba,
title = {Analyzing Patterns of Conversational Breakdown in Human-Chatbot Customer Service Conversations},
author = {Anouck Braggaar and Florian Kunneman and Emiel Miltenburg},
url = {https://2024.conversations.ws/wp-content/uploads/2024/11/conv24_fp_25_braggaar.pdf},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
abstract = {Many chatbots still struggle with correctly interpreting and responding to user enquiries. Therefore, it is important to figure out how and why chatbot-human conversations break down. In this study we analyzed features in user-utterances directly before a bot-initiated repair to determine their presence and prominence as possible predictors of conversational breakdowns. For this study we used data from a real-life public transport customer service chatbot, showing the errors that occur in actual deployed systems. The analysis shows that there are some features (such as commonness, outdated words, and unexpected words) that occur more often in utterances directly before a repair. Some features also correlate with each other and occur together, such as outdated words and subjectivity. By using feature analysis, many opportunities for improvement can be found either live (during the interaction) or afterwards},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
Image captioning in different languages Miscellaneous
2024.
@misc{a9e29fbf7a384771a8782b8de51353e4,
title = {Image captioning in different languages},
author = {Emiel Miltenburg},
url = {https://arxiv.org/abs/2407.09495},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
abstract = {This short position paper provides a manually curated list of non-English image captioning datasets (as of May 2024). Through this list, we can observe the dearth of datasets in different languages: only 23 different languages are represented. With the addition of the Crossmodal-3600 dataset (Thapliyal et al., 2022, 36 languages) this number increases somewhat, but still this number is small compared to the +/-500 institutional languages that are out there. This paper closes with some open questions for the field of Vision & Language.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2023
Backus, Ad; Cohen, Michael; Cohn, Neil; Faber, Myrthe; Krahmer, Emiel; Laparle, Schuyler; Maier, Emar; Miltenburg, Emiel; Roelofsen, Floris; Sciubba, Eleonora; Scholman, Merel; Shterionov, Dimitar; Sie, Maureen; Tomas, Frédéric; Vanmassenhove, Eva; Venhuizen, Noortje; Vos, Connie
Minds: Big questions for linguistics in the age of AI Journal Article
In: Linguistics in the Netherlands, vol. 40, no. 1, pp. 301–308, 2023, ISSN: 0929-7332.
@article{4b75df35be0b496bae9b63543a3909b8,
title = {Minds: Big questions for linguistics in the age of AI},
author = {Ad Backus and Michael Cohen and Neil Cohn and Myrthe Faber and Emiel Krahmer and Schuyler Laparle and Emar Maier and Emiel Miltenburg and Floris Roelofsen and Eleonora Sciubba and Merel Scholman and Dimitar Shterionov and Maureen Sie and Frédéric Tomas and Eva Vanmassenhove and Noortje Venhuizen and Connie Vos},
doi = {10.1075/avt.00094.bac},
issn = {0929-7332},
year = {2023},
date = {2023-11-03},
journal = {Linguistics in the Netherlands},
volume = {40},
number = {1},
pages = {301–308},
publisher = {John Benjamins Publishing Company},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Rasenberg, Marlou; Amha, Azeb; Coler, Matt; Koppen, Marjo; Miltenburg, Emiel; Rijk, Lynn; Stommel, Wyke; Dingemanse, Mark
Reimagining language: Towards a better understanding of language by including our interactions with non-humans Journal Article
In: Linguistics in the Netherlands, vol. 40, no. 1, pp. 309–317, 2023, ISSN: 0929-7332, (Publisher Copyright: © 2023 Algemene Vereniging voor Taalwetenschap.).
@article{5ede145510f34305b056fd26a3f4def8,
title = {Reimagining language: Towards a better understanding of language by including our interactions with non-humans},
author = {Marlou Rasenberg and Azeb Amha and Matt Coler and Marjo Koppen and Emiel Miltenburg and Lynn Rijk and Wyke Stommel and Mark Dingemanse},
doi = {10.1075/avt.00095.ras},
issn = {0929-7332},
year = {2023},
date = {2023-11-03},
journal = {Linguistics in the Netherlands},
volume = {40},
number = {1},
pages = {309–317},
publisher = {John Benjamins Publishing Company},
abstract = {What is language and who or what can be said to have it? In this essay we consider this question in the context of interactions with non-humans, specifically: animals and computers. While perhaps an odd pairing at first glance, here we argue that these domains can offer contrasting perspectives through which we can explore and reimagine language. The interactions between humans and animals, as well as between humans and computers, reveal both the essence and the boundaries of language: from examining the role of sequence and contingency in human-animal interaction, to unravelling the challenges of natural interactions with "smart" speakers and language models. By bringing together disparate fields around foundational questions, we push the boundaries of linguistic inquiry and uncover new insights into what language is and how it functions in diverse non-human-exclusive contexts.},
note = {Publisher Copyright: © 2023 Algemene Vereniging voor Taalwetenschap.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Miltenburg, Emiel; Braggaar, Anouck; Braun, Nadine; Damen, Debby; Goudbeek, Martijn; Lee, Chris; Tomas, Frédéric; Krahmer, Emiel
How reproducible is best-worst scaling for human evaluation? A reproduction of `Data-to-text Generation with Macro Planning' Proceedings Article
In: Belz, Anya; Popović, Maja; Reiter, Ehud; Thomson, Craig; Sedoc, João (Ed.): Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems, pp. 75–88, Incoma Ltd., Shoumen, Bulgaria, 2023.
@inproceedings{5a7ed974c0ce491ea6824138a00530f3,
title = {How reproducible is best-worst scaling for human evaluation? A reproduction of `Data-to-text Generation with Macro Planning'},
author = {Emiel Miltenburg and Anouck Braggaar and Nadine Braun and Debby Damen and Martijn Goudbeek and Chris Lee and Frédéric Tomas and Emiel Krahmer},
editor = {Anya Belz and Maja Popović and Ehud Reiter and Craig Thomson and João Sedoc},
url = {https://aclanthology.org/2023.humeval-1.7/},
year = {2023},
date = {2023-09-00},
urldate = {2023-09-00},
booktitle = {Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems},
pages = {75–88},
publisher = {Incoma Ltd., Shoumen, Bulgaria},
abstract = {This paper is part of the larger ReproHum project, where different teams of researchers aim to reproduce published experiments from the NLP literature. Specifically, ReproHum focuses on the reproducibility of human evaluation studies, where participants indicate the quality of different outputs of Natural Language Generation (NLG) systems. This is necessary because without reproduction studies, we do not know how reliable earlier results are. This paper aims to reproduce the second human evaluation study of Puduppully and Lapata (2021), while another lab is attempting to do the same. This experiment uses best-worst scaling to determine the relative performance of different NLG systems. We found that the worst performing system in the original study is now in fact the best performing system across the board. This means that we cannot fully reproduce the original results. We also carry out alternative analyses of the data, and discuss how our results may be combined with the other reproduction study that is carried out in parallel with this paper.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
Resource papers as registered reports: a proposal Journal Article
In: Northern European Journal of Language Technology, vol. 9, no. 1, pp. 1–6, 2023, ISSN: 2000-1533.
@article{3fa1bb2e662b410cb9cb9a6ec1a5bbe7,
title = {Resource papers as registered reports: a proposal},
author = {Emiel Miltenburg},
doi = {10.3384/nejlt.2000-1533.2023.4884},
issn = {2000-1533},
year = {2023},
date = {2023-07-14},
journal = {Northern European Journal of Language Technology},
volume = {9},
number = {1},
pages = {1–6},
abstract = {This is a proposal for publishing resource papers as registered reports in the Northern European Journal of Language Technology. The idea is that authors write a data collection plan with a full data statement, to the extent that it can be written before data collection starts. Once the proposal is approved, publication of the final resource paper is guaranteed, as long as the data collection plan is followed (modulo reasonable changes due to unforeseen circumstances). This proposal changes the reviewing process from an antagonistic to a collaborative enterprise, and hopefully encourages NLP resources to develop and publish more high-quality datasets. The key advantage of this proposal is that it helps to promote responsible resource development (through constructive peer review) and to avoid research waste.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Huynh, Minh Hien; Lentz, Tomas; Miltenburg, Emiel
Implicit causality in GPT-2: a case study Proceedings Article
In: Amblard, Maxime; Breitholtz, Ellen (Ed.): Proceedings of the 15th International Conference on Computational Semantics, pp. 67–77, Association for Computational Linguistics, 2023.
@inproceedings{378aa60f485a448cb58a9aaf9f56cc0c,
title = {Implicit causality in GPT-2: a case study},
author = {Minh Hien Huynh and Tomas Lentz and Emiel Miltenburg},
editor = {Maxime Amblard and Ellen Breitholtz},
url = {https://aclanthology.org/2023.iwcs-1.7/},
year = {2023},
date = {2023-06-00},
urldate = {2023-06-00},
booktitle = {Proceedings of the 15th International Conference on Computational Semantics},
pages = {67–77},
publisher = {Association for Computational Linguistics},
abstract = {This case study investigates the extent to which a language model (GPT-2) is able to capture native speakers' intuitions about implicit causality in a sentence completion task. Study 1 reproduces earlier results (showing that the model's surprisal values correlate with the implicit causality bias of the verb; Davis and van Schijndel 2021), and then examine the effects of gender and verb frequency on model performance. Study 2 examines the reasoning ability of GPT-2: Is the model able to produce more sensible motivations for why the subject VERBed the object if the verbs have stronger causality biases? For this study we took care to avoid human raters being biased by obscenities and disfluencies generated by the model.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Belz, Anya; Thomson, Craig; Reiter, Ehud; Abercrombie, Gavin; Alonso-Moral, Jose M.; Arvan, Mohammad; Braggaar, Anouck; Cieliebak, Mark; Clark, Elizabeth; Deemter, Kees; Dinkar, Tanvi; Dušek, Ondřej; Eger, Steffen; Fang, Qixiang; Gao, Mingqi; Gatt, Albert; Gkatzia, Dimitra; González-Corbelle, Javier; Hovy, Dirk; Hürlimann, Manuela; Ito, Takumi; Kelleher, John D.; Klubicka, Filip; Krahmer, Emiel; Lai, Huiyuan; Lee, Chris; Li, Yiru; Mahamood, Saad; Mieskes, Margot; Miltenburg, Emiel; Mosteiro, Pablo; Nissim, Malvina; Parde, Natalie; Plátek, Ondřej; Rieser, Verena; Ruan, Jie; Tetreault, Joel; Toral, Antonio; Wan, Xiaojun; Wanner, Leo; Watson, Lewis; Yang, Diyi
Missing Information, Unresponsive Authors, Experimental Flaws: The Impossibility of Assessing the Reproducibility of Previous Human Evaluations in NLP Proceedings Article
In: Tafreshi, Shabnam; Akula, Arjun; Sedoc, João; Drozd, Aleksandr; Rogers, Anna; Rumshisky, Anna (Ed.): The Fourth Workshop on Insights from Negative Results in NLP, pp. 1–10, Association for Computational Linguistics, 2023.
@inproceedings{168ea12dfee34500b890cff6d859e673,
title = {Missing Information, Unresponsive Authors, Experimental Flaws: The Impossibility of Assessing the Reproducibility of Previous Human Evaluations in NLP},
author = {Anya Belz and Craig Thomson and Ehud Reiter and Gavin Abercrombie and Jose M. Alonso-Moral and Mohammad Arvan and Anouck Braggaar and Mark Cieliebak and Elizabeth Clark and Kees Deemter and Tanvi Dinkar and Ondřej Dušek and Steffen Eger and Qixiang Fang and Mingqi Gao and Albert Gatt and Dimitra Gkatzia and Javier González-Corbelle and Dirk Hovy and Manuela Hürlimann and Takumi Ito and John D. Kelleher and Filip Klubicka and Emiel Krahmer and Huiyuan Lai and Chris Lee and Yiru Li and Saad Mahamood and Margot Mieskes and Emiel Miltenburg and Pablo Mosteiro and Malvina Nissim and Natalie Parde and Ondřej Plátek and Verena Rieser and Jie Ruan and Joel Tetreault and Antonio Toral and Xiaojun Wan and Leo Wanner and Lewis Watson and Diyi Yang},
editor = {Shabnam Tafreshi and Arjun Akula and João Sedoc and Aleksandr Drozd and Anna Rogers and Anna Rumshisky},
doi = {10.18653/v1/2023.insights-1.1},
year = {2023},
date = {2023-05-00},
booktitle = {The Fourth Workshop on Insights from Negative Results in NLP},
pages = {1–10},
publisher = {Association for Computational Linguistics},
abstract = {We report our efforts in identifying a set of previous human evaluations in NLP that would be suitable for a coordinated study examining what makes human evaluations in NLP more/less reproducible. We present our results and findings, which include that just 13% of papers had (i) sufficiently low barriers to reproduction, and (ii) enough obtainable information, to be considered for reproduction, and that all but one of the experiments we selected for reproduction was discovered to have flaws that made the meaningfulness of conducting a reproduction questionable. As a result, we had to change our coordinated study design from a reproduce approach to a standardise-then-reproduce-twice approach. Our overall (negative) finding that the great majority of human evaluations in NLP is not repeatable and/or not reproducible and/or too flawed to justify reproduction, paints a dire picture, but presents an opportunity for a rethink about how to design and report human evaluations in NLP.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Clinciu, Miruna; Dušek, Ondřej; Gkatzia, Dimitra; Inglis, Stephanie; Leppänen, Leo; Mahamood, Saad; Schoch, Stephanie; Thomson, Craig; Wen, Luou
Barriers and enabling factors for error analysis in NLG research Journal Article
In: Northern European Journal of Language Technology, vol. 9, no. 1, pp. 1–22, 2023, ISSN: 2000-1533.
@article{ed6855082d5e43789a5a28c1b2646a27,
title = {Barriers and enabling factors for error analysis in NLG research},
author = {Emiel Miltenburg and Miruna Clinciu and Ondřej Dušek and Dimitra Gkatzia and Stephanie Inglis and Leo Leppänen and Saad Mahamood and Stephanie Schoch and Craig Thomson and Luou Wen},
doi = {10.3384/nejlt.2000-1533.2023.4529},
issn = {2000-1533},
year = {2023},
date = {2023-02-21},
journal = {Northern European Journal of Language Technology},
volume = {9},
number = {1},
pages = {1–22},
abstract = {Earlier research has shown that few studies in Natural Language Generation (NLG) evaluate their system outputs using an error analysis, despite known limitations of automatic evaluation metrics and human ratings. This position paper takes the stance that error analyses should be encouraged, and discusses several ways to do so. This paper is based on our shared experience as authors as well as a survey we distributed as a means of public consultation. We provide an overview of existing barriers to carrying out error analyses, and propose changes to improve error reporting in the NLG literature.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Miltenburg, Emiel
Evaluating NLG systems: A brief introduction Miscellaneous
2023, (Originally published on the website of the International Conference on Natural Language Generation (INLG) 2023: https://inlg2023.github.io/eval_blog.html).
@misc{069e90bae42e40388d3955d537c63d6e,
title = {Evaluating NLG systems: A brief introduction},
author = {Emiel Miltenburg},
url = {https://inlg2023.github.io/eval_blog.html},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
abstract = {This year the International Conference on Natural Language Generation (INLG) will feature an award for the paper with the best evaluation. The purpose of this award is to provide an incentive for NLG researchers to pay more attention to the way they assess the output of their systems. This essay provides a short introduction to evaluation in NLG, explaining key terms and distinctions.},
note = {Originally published on the website of the International Conference on Natural Language Generation (INLG) 2023: https://inlg2023.github.io/eval_blog.html},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Coretta, Stefano; Casillas, Joseph V.; ...; Miltenburg, Emiel; ...; Roettger, Timo B.
Multidimensional Signals and Analytic Flexibility: Estimating Degrees of Freedom in Human-Speech Analyses Journal Article
In: Advances in Methods and Practices in Psychological Science, vol. 6, no. 3, pp. 25152459231162567, 2023.
@article{doi:10.1177/25152459231162567,
title = {Multidimensional Signals and Analytic Flexibility: Estimating Degrees of Freedom in Human-Speech Analyses},
author = {Stefano Coretta and Joseph V. Casillas and ... and Emiel Miltenburg and ... and Timo B. Roettger},
url = {https://doi.org/10.1177/25152459231162567},
doi = {10.1177/25152459231162567},
year = {2023},
date = {2023-01-01},
journal = {Advances in Methods and Practices in Psychological Science},
volume = {6},
number = {3},
pages = {25152459231162567},
abstract = {Recent empirical studies have highlighted the large degree of analytic flexibility in data analysis that can lead to substantially different conclusions based on the same data set. Thus, researchers have expressed their concerns that these researcher degrees of freedom might facilitate bias and can lead to claims that do not stand the test of time. Even greater flexibility is to be expected in fields in which the primary data lend themselves to a variety of possible operationalizations. The multidimensional, temporally extended nature of speech constitutes an ideal testing ground for assessing the variability in analytic approaches, which derives not only from aspects of statistical modeling but also from decisions regarding the quantification of the measured behavior. In this study, we gave the same speech-production data set to 46 teams of researchers and asked them to answer the same research question, resulting in substantial variability in reported effect sizes and their interpretation. Using Bayesian meta-analytic tools, we further found little to no evidence that the observed variability can be explained by analysts’ prior beliefs, expertise, or the perceived quality of their analyses. In light of this idiosyncratic variability, we recommend that researchers more transparently share details of their analysis, strengthen the link between theoretical construct and quantitative system, and calibrate their (un)certainty in their conclusions.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2022
Braggaar, Anouck; Tomas, Frédéric; Blomsma, Peter; Hommes, Saar; Braun, Nadine; Miltenburg, Emiel; Lee, Chris; Goudbeek, Martijn; Krahmer, Emiel
A reproduction study of methods for evaluating dialogue system output: Replicating Santhanam and Shaikh (2019) Proceedings Article
In: Proceedings of the 15th International Conference on Natural Language Generation: Generation Challenges, pp. 86–93, Association for Computational Linguistics, 2022, ISBN: 978-1-955917-60-5.
@inproceedings{0a74ace0f2e74d07b6d4ac8bafbaa9df,
title = {A reproduction study of methods for evaluating dialogue system output: Replicating Santhanam and Shaikh (2019)},
author = {Anouck Braggaar and Frédéric Tomas and Peter Blomsma and Saar Hommes and Nadine Braun and Emiel Miltenburg and Chris Lee and Martijn Goudbeek and Emiel Krahmer},
url = {https://aclanthology.org/2022.inlg-genchal.13/},
isbn = {978-1-955917-60-5},
year = {2022},
date = {2022-07-01},
urldate = {2022-07-01},
booktitle = {Proceedings of the 15th International Conference on Natural Language Generation: Generation Challenges},
pages = {86–93},
publisher = {Association for Computational Linguistics},
abstract = {In this paper, we describe our reproduction effort of the paper: Towards Best Experiment Design for Evaluating Dialogue System Output by Santhanam and Shaikh (2019) for the 2022 ReproGen shared task. We aim to produce the same results, using different human evaluators, and a different implementation of the automatic metrics used in the original paper. Although overall the study posed some challenges to reproduce (e.g. difficulties with reproduction of automatic metrics and statistics), in the end we did find that the results generally replicate the findings of Santhanam and Shaikh (2019) and seem to follow similar trends.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Braggaar, Anouck; Martijn, Gabriella; Liebrecht, C.; Hooijdonk, Charlotte; Miltenburg, Emiel; Kunneman, Florian; Krahmer, Emiel; Hoeken, Hans; Molder, Hedwig
Smooth operators. Development and effects of personalized conversational AI Conference
2022, (The 32nd Meeting of Computational Linguistics in The Netherlands, CLIN ; Conference date: 17-06-2022 Through 17-06-2022).
@conference{b341c2d529e540a0bf734fbccfd68be2,
title = {Smooth operators. Development and effects of personalized conversational AI},
author = {Anouck Braggaar and Gabriella Martijn and C. Liebrecht and Charlotte Hooijdonk and Emiel Miltenburg and Florian Kunneman and Emiel Krahmer and Hans Hoeken and Hedwig Molder},
url = {https://clin2022.uvt.nl/},
year = {2022},
date = {2022-06-17},
abstract = {Organizations are increasingly implementing chatbots to provide customer service as chatbots are always available and can help customers quickly. However, there are still improvements to be made for chatbots to reach their full potential since 1) chatbot technology still faces some limitations, 2) customers perceive chatbot communication as unnatural and impersonal, and 3) customer service employees are still trying to find their way in collaborating with their new ‘colleague.’ In this 4-year NWO-funded project, we aim to develop and evaluate chatbots with a human touch to improve customers’ and employees’ collaboration and experience within a customer service context. In the first year of the project, we focused on the evaluation of customer service chatbots on the one hand, and the evaluation of the multifaceted collaboration between customer service employees and chatbots on the other hand. A systematic literature review was conducted to investigate how chatbots as task-based dialogue systems are evaluated within different fields of study. While the more technical fields (such as NLP) seem to focus to a great extent on automatic metrics, the more business-oriented fields (such as communication science) often make use of human evaluations. By conducting a search in four databases (ACL, ACM, IEEE and Web of Science) 3,800 records were retrieved that contained an evaluation of task-oriented dialogue systems/chatbots or discussed evaluation techniques. After screening, 146 studies were included in the literature review. These papers were assessed on what evaluation techniques were used, how they were used and in what context. The final goal of the study is to make an overview of metrics that are used in the technical fields and make them understandable and usable for the business-oriented fields. The perceptions of managers, conversational designers, and human agents regarding their criteria for evaluating human chatbot collaboration were examined by means of an interview study. Our study found that all parties used their own criteria to evaluate the collaboration and that the evaluation criteria used varied according to the job positions interviewees held. Managers evaluate the chatbot collaboration in terms of cost reduction. Conversational designers perceive both customers as well as human agents as their ‘customers’, focusing on customer satisfaction as their main evaluation criteria. Human agents evaluate the collaboration by looking at the extent to which collaborating with the chatbot has positively affected their job satisfaction and has resulted in traffic improvements. Finally, in terms of improvements, our results showed that both human agents and conversational designers advocate back-end integration of the chatbot to improve collaboration. However, it also became clear that with this collaboration, new dilemmas arise, such as team alignment and privacy issues related to the processing of personal data. Such insights could be considered in future chatbot design to make the collaboration within human chatbot teams run as smoothly as possible and in that respect benefit organizations, human agents, and customers.},
note = {The 32nd Meeting of Computational Linguistics in The Netherlands, CLIN ; Conference date: 17-06-2022 Through 17-06-2022},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
2021
Liebrecht, C.; Hooijdonk, Charlotte; Kunneman, Florian; Miltenburg, Emiel
“Hallo, ik ben Anna, uw virtuele assistent”: Talige kenmerken in customer service chatbots Journal Article
In: DIXIT: tijdschrift over toegepaste taal- en spraaktechnologie, vol. 18, pp. 10–12, 2021, ISSN: 1572-6037.
@article{6738a263482f41eda068af6dec479d8a,
title = {“Hallo, ik ben Anna, uw virtuele assistent”: Talige kenmerken in customer service chatbots},
author = {C. Liebrecht and Charlotte Hooijdonk and Florian Kunneman and Emiel Miltenburg},
issn = {1572-6037},
year = {2021},
date = {2021-12-01},
journal = {DIXIT: tijdschrift over toegepaste taal- en spraaktechnologie},
volume = {18},
pages = {10–12},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Corone, Anna; Nanne, Annemarie; Miltenburg, Emiel
Controlling Social Media Data: a Case Study of the Effect of Social Presence on Consumers’ Engagement with Brand-generated Instagram Posts Proceedings Article
In: Hendrickx, Iris; Verheijen, Lieke; Wijngaert, Lidwien (Ed.): Proceedings of the 8th Conference on Computer-Mediated Communication CMC and Social Media Corpora (CMC-Corpora2021), pp. 25–29, Radboud University, 2021, (Conference on Computer-Mediated Communication CMC and Social Media Corpora, CMC-Corpora ; Conference date: 28-10-2021 Through 29-10-2021).
@inproceedings{5dce684aeba548c0bd1550c82f98593e,
title = {Controlling Social Media Data: a Case Study of the Effect of Social Presence on Consumers’ Engagement with Brand-generated Instagram Posts},
author = {Anna Corone and Annemarie Nanne and Emiel Miltenburg},
editor = {Iris Hendrickx and Lieke Verheijen and Lidwien Wijngaert},
url = {https://cmc-corpora.org/conferences/cmc-corpora2021/},
year = {2021},
date = {2021-10-01},
booktitle = {Proceedings of the 8th Conference on Computer-Mediated Communication CMC and Social Media Corpora (CMC-Corpora2021)},
pages = {25–29},
publisher = {Radboud University},
abstract = {Research in social media marketing studies ways to increase customers’ engagement with brand-generated social media posts. This can either be done through experiments, or corpus studies of existing social media posts. Experiments have the advantage that they are controlled, but they often lack ecological validity, while for corpus studies the reverse is often true. As a case study, we construct a corpus of 1761 brand-generated Instagram posts, looking at the effect of social presence (the perception of human contact) on different engagement metrics (likes and comments), taking the effect of possible confounds (theme of slogans, funniness, time) into account. We show how social media posts can be analyzed at different levels of granularity, to establish the strength of the effect of social presence. We hope that our work will help others to isolate the impact of different variables on post engagement on social media.},
note = {Conference on Computer-Mediated Communication CMC and Social Media Corpora, CMC-Corpora ; Conference date: 28-10-2021 Through 29-10-2021},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Gehrmann, Sebastian; Adewumi, Tosin; Aggarwal, Karmanya; Ammanamanchi, Pawan Sasanka; Aremu, Anuoluwapo; Bosselut, Antoine; Chandu, Khyathi Raghavi; Clinciu, Miruna-Adriana; Das, Dipanjan; Dhole, Kaustubh; Du, Wanyu; Durmus, Esin; Dušek, Ondřej; Emezue, Chris Chinenye; Gangal, Varun; Garbacea, Cristina; Hashimoto, Tatsunori; Hou, Yufang; Jernite, Yacine; Jhamtani, Harsh; Ji, Yangfeng; Jolly, Shailza; Kale, Mihir; Kumar, Dhruv; Ladhak, Faisal; Madaan, Aman; Maddela, Mounica; Mahajan, Khyati; Mahamood, Saad; Majumder, Bodhisattwa Prasad; Martins, Pedro Henrique; McMillan-Major, Angelina; Mille, Simon; Miltenburg, Emiel; Nadeem, Moin; Narayan, Shashi; Nikolaev, Vitaly; Rubungo, Andre Niyongabo; Osei, Salomey; Parikh, Ankur; Perez-Beltrachini, Laura; Rao, Niranjan Ramesh; Raunak, Vikas; Rodriguez, Juan Diego; Santhanam, Sashank; Sedoc, João; Sellam, Thibault; Shaikh, Samira; Shimorina, Anastasia; Cabezudo, Marco Antonio Sobrevilla; Strobelt, Hendrik; Subramani, Nishant; Xu, Wei; Yang, Diyi; Yerukola, Akhila; Zhou, Jiawei
The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics Proceedings Article
In: Proceedings of the 1st Workshop on Natural Language Generation, Evaluation, and Metrics (GEM 2021), pp. 96–120, Association for Computational Linguistics, 2021, (Workshop on Natural Language Generation, Evaluation, and Metrics , GEM2021 ; Conference date: 05-08-2021 Through 06-08-2021).
@inproceedings{c42a48c622ca4ea49e886a43cdf0b3f9,
title = {The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics},
author = {Sebastian Gehrmann and Tosin Adewumi and Karmanya Aggarwal and Pawan Sasanka Ammanamanchi and Anuoluwapo Aremu and Antoine Bosselut and Khyathi Raghavi Chandu and Miruna-Adriana Clinciu and Dipanjan Das and Kaustubh Dhole and Wanyu Du and Esin Durmus and Ondřej Dušek and Chris Chinenye Emezue and Varun Gangal and Cristina Garbacea and Tatsunori Hashimoto and Yufang Hou and Yacine Jernite and Harsh Jhamtani and Yangfeng Ji and Shailza Jolly and Mihir Kale and Dhruv Kumar and Faisal Ladhak and Aman Madaan and Mounica Maddela and Khyati Mahajan and Saad Mahamood and Bodhisattwa Prasad Majumder and Pedro Henrique Martins and Angelina McMillan-Major and Simon Mille and Emiel Miltenburg and Moin Nadeem and Shashi Narayan and Vitaly Nikolaev and Andre Niyongabo Rubungo and Salomey Osei and Ankur Parikh and Laura Perez-Beltrachini and Niranjan Ramesh Rao and Vikas Raunak and Juan Diego Rodriguez and Sashank Santhanam and João Sedoc and Thibault Sellam and Samira Shaikh and Anastasia Shimorina and Marco Antonio Sobrevilla Cabezudo and Hendrik Strobelt and Nishant Subramani and Wei Xu and Diyi Yang and Akhila Yerukola and Jiawei Zhou},
url = {https://www.aclweb.org/portal/content/first-workshop-generation-evaluation-and-metrics-acl-2021},
year = {2021},
date = {2021-08-00},
booktitle = {Proceedings of the 1st Workshop on Natural Language Generation, Evaluation, and Metrics (GEM 2021)},
pages = {96–120},
publisher = {Association for Computational Linguistics},
abstract = {We introduce GEM, a living benchmark for natural language Generation (NLG), its Evaluation, and Metrics. Measuring progress in NLG relies on a constantly evolving ecosystem of automated metrics, datasets, and human evaluation standards. Due to this moving target, new models often still evaluate on divergent anglo-centric corpora with well-established, but flawed, metrics. This disconnect makes it challenging to identify the limitations of current models and opportunities for progress. Addressing this limitation, GEM provides an environment in which models can easily be applied to a wide set of tasks and in which evaluation strategies can be tested. Regular updates to the benchmark will help NLG research become more multilingual and evolve the challenge alongside models. This paper serves as the description of the data for the 2021 shared task at the associated GEM Workshop.},
note = {Workshop on Natural Language Generation, Evaluation, and Metrics , GEM2021 ; Conference date: 05-08-2021 Through 06-08-2021},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Clinciu, Miruna; Dušek, Ondřej; Gkatzia, Dimitra; Inglis, Stephanie; Leppänen, Leo; Mahamood, Saad; Manning, Emma; Schoch, Stephanie; Thomson, Craig; Wen, Luou
Underreporting of errors in NLG output, and what to do about it Proceedings Article
In: Proceedings of the 14th International Conference on Natural Language Generation, pp. 140–153, Association for Computational Linguistics, 2021, (The 14th International Conference on Natural Language Generation, INLG ; Conference date: 20-09-2021 Through 24-09-2021).
@inproceedings{cb5ec7f25f5a44ee950fd1f20209b93b,
title = {Underreporting of errors in NLG output, and what to do about it},
author = {Emiel Miltenburg and Miruna Clinciu and Ondřej Dušek and Dimitra Gkatzia and Stephanie Inglis and Leo Leppänen and Saad Mahamood and Emma Manning and Stephanie Schoch and Craig Thomson and Luou Wen},
url = {https://aclanthology.org/2021.inlg-1.14/},
doi = {10.18653/v1/2021.inlg-1.14},
year = {2021},
date = {2021-08-00},
urldate = {2021-08-00},
booktitle = {Proceedings of the 14th International Conference on Natural Language Generation},
pages = {140–153},
publisher = {Association for Computational Linguistics},
abstract = {We observe a severe under-reporting of the different kinds of errors that Natural Language Generation systems make. This is a problem, because mistakes are an important indicator of where systems should still be improved. If authors only report overall performance metrics, the research community is left in the dark about the specific weaknesses that are exhibited by `state-of-the-art' research. Next to quantifying the extent of error under-reporting, this position paper provides recommendations for error identification, analysis and reporting.},
note = {The 14th International Conference on Natural Language Generation, INLG ; Conference date: 20-09-2021 Through 24-09-2021},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Lee, Chris; Krahmer, Emiel
Preregistering NLP research Proceedings Article
In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 613–623, Association for Computational Linguistics, 2021, (Human Language Technology Conference 2021, HLTCon ; Conference date: 16-03-2021 Through 18-03-2021).
@inproceedings{e5cce62278324800853fc8365a002ab8,
title = {Preregistering NLP research},
author = {Emiel Miltenburg and Chris Lee and Emiel Krahmer},
url = {https://www.hltcon.org/},
year = {2021},
date = {2021-06-00},
urldate = {2021-06-00},
booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
pages = {613–623},
publisher = {Association for Computational Linguistics},
abstract = {Preregistration refers to the practice of specifying what you are going to do, and what you expect to find in your study, before carrying out the study. This practice is increasingly common in medicine and psychology, but is rarely discussed in NLP. This paper discusses preregistration in more detail, explores how NLP researchers could preregister their work, and presents several preregistration questions for different kinds of studies. Finally, we argue in favour of registered reports, which could provide firmer grounds for slow science in NLP research. The goal of this paper is to elicit a discussion in the NLP community, which we hope to synthesise into a general NLP preregistration form in future research.},
note = {Human Language Technology Conference 2021, HLTCon ; Conference date: 16-03-2021 Through 18-03-2021},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Lee, Chris; Gatt, Albert; Miltenburg, Emiel; Krahmer, Emiel
Human evaluation of automatically generated text: Current trends and best practice guidelines Journal Article
In: Computer Speech and Language: An official publication of the International Speech Communication Association (ISCA), vol. 67, pp. 1–24, 2021, ISSN: 0885-2308, (Funding Information: We received support from RAAK-PRO SIA (2014-01-51PRO) and The Netherlands Organization for Scientific Research (NWO 360-89-050), which is gratefully acknowledged. Furthermore, we want to extend our gratitude towards the anonymous reviewers and also towards Leshem Choshen, Ondřej Dušek, Kees van Deemter, Dimitra Gkatzia, David Howcroft, Ehud Reiter, and Sander Wubben for their valuable comments on the paper. Publisher Copyright: © 2020 The Authors).
@article{d7e145ce52934367931192384e305b11,
title = {Human evaluation of automatically generated text: Current trends and best practice guidelines},
author = {Chris Lee and Albert Gatt and Emiel Miltenburg and Emiel Krahmer},
doi = {10.1016/j.csl.2020.101151},
issn = {0885-2308},
year = {2021},
date = {2021-05-21},
journal = {Computer Speech and Language: An official publication of the International Speech Communication Association (ISCA)},
volume = {67},
pages = {1–24},
publisher = {Academic Press},
abstract = {Currently, there is little agreement as to how Natural Language Generation (NLG) systems should be evaluated, with a particularly high degree of variation in the way that human evaluation is carried out. This paper provides an overview of how (mostly intrinsic) human evaluation is currently conducted and presents a set of best practices, grounded in the literature. These best practices are also linked to the stages that researchers go through when conducting an evaluation research (planning stage; execution and release stage), and the specific steps in these stages. With this paper, we hope to contribute to the quality and consistency of human evaluations in NLG. (C) 2020 The Authors. Published by Elsevier Ltd.},
note = {Funding Information: We received support from RAAK-PRO SIA (2014-01-51PRO) and The Netherlands Organization for Scientific Research (NWO 360-89-050), which is gratefully acknowledged. Furthermore, we want to extend our gratitude towards the anonymous reviewers and also towards Leshem Choshen, Ondřej Dušek, Kees van Deemter, Dimitra Gkatzia, David Howcroft, Ehud Reiter, and Sander Wubben for their valuable comments on the paper. Publisher Copyright: © 2020 The Authors},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Mille, Simon; Dhole, Kaustubh; Mahamood, Saad; Perez-Beltrachini, Laura; Gangal, Varun Prashant; Kale, Mihir; Miltenburg, Emiel; Gehrmann, Sebastian
Automatic Construction of Evaluation Suites for Natural Language Generation Datasets Proceedings Article
In: Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1 (NeurIPS Datasets and Benchmarks 2021), 2021, (Conference on Neural Information Processing Systems 2021 : Datasets and Benchmarks , NeurIPS 2021 ; Conference date: 28-11-2021 Through 09-12-2021).
@inproceedings{c19e8a972ed84f198bedb221b12f5ba4,
title = {Automatic Construction of Evaluation Suites for Natural Language Generation Datasets},
author = {Simon Mille and Kaustubh Dhole and Saad Mahamood and Laura Perez-Beltrachini and Varun Prashant Gangal and Mihir Kale and Emiel Miltenburg and Sebastian Gehrmann},
url = {https://neurips.cc/},
year = {2021},
date = {2021-01-01},
booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks 1 (NeurIPS Datasets and Benchmarks 2021)},
abstract = {Machine learning approaches applied to NLP are often evaluated by summarizing their performance in a single number, for example accuracy. Since most test sets are constructed as an i.i.d. sample from the overall data, this approach overly simplifies the complexity of language and encourages overfitting to the head of the data distribution. As such, rare language phenomena or text about underrepresented groups are not equally included in the evaluation. To encourage more in-depth model analyses, researchers have proposed the use of multiple test sets, also called challenge sets, that assess specific capabilities of a model. In this paper, we develop a framework based on this idea which is able to generate controlled perturbations and identify subsets in text-to-scalar, text-to-text, or data-to-text settings. By applying this framework to the GEM generation benchmark, we develop evaluation suites made of 80 challenge sets, demonstrate the kinds of analyses that it enables, and shed light onto the limits of current generation models.},
note = {Conference on Neural Information Processing Systems 2021 : Datasets and Benchmarks , NeurIPS 2021 ; Conference date: 28-11-2021 Through 09-12-2021},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2020
Miltenburg, Emiel; Lee, Chris; Castro-Ferreira, Thiago; Krahmer, Emiel
Evaluation rules! On the use of grammars and rule-based systems for NLG evaluation Proceedings Article
In: Proceedings of the 1st Workshop on Evaluating NLG Evaluation, pp. 17–27, Association for Computational Linguistics, 2020, (Workshop on Evaluating NLG Evaluation ; Conference date: 18-12-2020).
@inproceedings{ea98daa9081d4a58bbd0ba12912ec9aa,
title = {Evaluation rules! On the use of grammars and rule-based systems for NLG evaluation},
author = {Emiel Miltenburg and Chris Lee and Thiago Castro-Ferreira and Emiel Krahmer},
url = {https://evalnlg-workshop.github.io/},
year = {2020},
date = {2020-12-00},
booktitle = {Proceedings of the 1st Workshop on Evaluating NLG Evaluation},
pages = {17–27},
publisher = {Association for Computational Linguistics},
abstract = {NLG researchers often use uncontrolled corpora to train and evaluate their systems, using textual similarity metrics, such as BLEU. This position paper argues in favour of two alternative evaluation strategies, using grammars or rule-based systems. These strategies are particularly useful to identify the strengths and weaknesses of different systems. We contrast our proposals with the (extended) WebNLG dataset, which is revealed to have a skewed distribution of predicates. We predict that this distribution affects the quality of the predictions for systems trained on this data. However, this hypothesis can only be thoroughly tested (without any confounds) once we are able to systematically manipulate the skewness of the data, using a rule-based approach.},
note = {Workshop on Evaluating NLG Evaluation ; Conference date: 18-12-2020},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Lu, Wei-Ting; Krahmer, Emiel; Gatt, Albert; Chen, Guanyi; Li, Lin; Deemter, Kees
Gradations of Error Severity in Automatic Image Descriptions Proceedings Article
In: Proceedings of the 13th International Conference on Natural Language Generation, pp. 398–411, Association for Computational Linguistics, 2020.
@inproceedings{3cef1ba55633417f8dfab9d5f571583a,
title = {Gradations of Error Severity in Automatic Image Descriptions},
author = {Emiel Miltenburg and Wei-Ting Lu and Emiel Krahmer and Albert Gatt and Guanyi Chen and Lin Li and Kees Deemter},
url = {https://aclanthology.org/2020.inlg-1.45/},
doi = {10.18653/v1/2020.inlg-1.45},
year = {2020},
date = {2020-12-00},
urldate = {2020-12-00},
booktitle = {Proceedings of the 13th International Conference on Natural Language Generation},
pages = {398–411},
publisher = {Association for Computational Linguistics},
abstract = {Earlier research has shown that evaluation metrics based on textual similarity (e.g., BLEU, CIDEr, Meteor) do not correlate well with human evaluation scores for automatically generated text. We carried out an experiment with Chinese speakers, where we systematically manipulated image descriptions to contain different kinds of errors. Because our manipulated descriptions form minimal pairs with the reference descriptions, we are able to assess the impact of different kinds of errors on the perceived quality of the descriptions. Our results show that different kinds of errors elicit significantly different evaluation scores, even though all erroneous descriptions differ in only one character from the reference descriptions. Evaluation metrics based solely on textual similarity are unable to capture these differences, which (at least partially) explains their poor correlation with human judgments. Our work provides the foundations for future work, where we aim to understand why different errors are seen as more or less severe.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
How Do Image Description Systems Describe People? A Targeted Assessment of System Competence in the PEOPLE-domain Proceedings Article
In: Proceedings of the Second Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN), pp. 30–36, Association for Computational Linguistics, 2020, (Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge, LANTERN ; Conference date: 01-12-2020).
@inproceedings{506e4c4e9af348679884039a4437630a,
title = {How Do Image Description Systems Describe People? A Targeted Assessment of System Competence in the PEOPLE-domain},
author = {Emiel Miltenburg},
url = {https://www.lantern.uni-saarland.de/2020/},
year = {2020},
date = {2020-12-00},
booktitle = {Proceedings of the Second Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge (LANTERN)},
pages = {30–36},
publisher = {Association for Computational Linguistics},
abstract = {Evaluations of image description systems are typically domain-general: generated descriptions for the held-out test images are either compared to a set of reference descriptions (using automated metrics), or rated by human judges on one or more Likert scales (for fluency, overall quality, and other quality criteria). While useful, these evaluations do not tell us anything about the kinds of image descriptions that systems are able to produce. Or, phrased differently, these evaluations do not tell us anything about the cognitive capabilities of image description systems. This paper proposes a different kind of assessment, that is able to quantify the extent to which these systems are able to describe humans. This assessment is based on a manual characterisation (a context-free grammar) of English entity labels in the PEOPLE domain, to determine the range of possible outputs. We examined 9 systems to see what kinds of labels they actually use. We found that these systems only use a small subset of at most 13 different kinds of modifiers (e.g. tall and short modify HEIGHT, sad and happy modify MOOD), but 27 kinds of modifiers are never used. Future research could study these semantic dimensions in more detail.},
note = {Workshop on Beyond Vision and LANguage: inTEgrating Real-world kNowledge, LANTERN ; Conference date: 01-12-2020},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Howcroft, David M.; Belz, Anya; Clinciu, Miruna-Adriana; Gkatzia, Dimitra; Hasan, Sadid A.; Mahamood, Saad; Mille, Simon; Miltenburg, Emiel; Santhanam, Sashank; Rieser, Verena
Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions Proceedings Article
In: Proceedings of the 13th International Conference on Natural Language Generation, pp. 169–182, Association for Computational Linguistics, 2020, (International Conference on Natural Language Generation, INLG 2020 ; Conference date: 15-12-2020 Through 18-12-2020).
@inproceedings{f73a13db89a042309e8a25c3fb845419,
title = {Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions},
author = {David M. Howcroft and Anya Belz and Miruna-Adriana Clinciu and Dimitra Gkatzia and Sadid A. Hasan and Saad Mahamood and Simon Mille and Emiel Miltenburg and Sashank Santhanam and Verena Rieser},
url = {https://www.inlg2020.org/},
year = {2020},
date = {2020-12-00},
booktitle = {Proceedings of the 13th International Conference on Natural Language Generation},
pages = {169–182},
publisher = {Association for Computational Linguistics},
abstract = {Human assessment remains the most trusted form of evaluation in NLG, but highly diverse approaches and a proliferation of different quality criteria used by researchers make it difficult to compare results and draw conclusions across papers, with adverse implications for meta-evaluation and reproducibility. In this paper, we present (i) our dataset of 165 NLG papers with human evaluations, (ii) the annotation scheme we developed to label the papers for different aspects of evaluations, (iii) quantitative analyses of the annotations, and (iv) a set of recommendations for improving standards in evaluation reporting. We use the annotations as a basis for examining information included in evaluation reports, and levels of consistency in approaches, experimental design and terminology, focusing in particular on the 200+ different terms that have been used for evaluated aspects of quality. We conclude that due to a pervasive lack of clarity in reports and extreme diversity in approaches, human evaluation in NLG presents as extremely confused in 2020, and that the field is in urgent need of standard methods and terminology.},
note = {International Conference on Natural Language Generation, INLG 2020 ; Conference date: 15-12-2020 Through 18-12-2020},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Lee, Chris; Gatt, Albert; Miltenburg, Emiel; Wubben, Sander; Krahmer, Emiel
Best practices for the human evaluation of automatically generated text Proceedings Article
In: Proceedings of the 12th International Conference on Natural Language Generation, pp. 355–368, Association for Computational Linguistics, 2020, (12th International conference on Natural Language Generation (INLG 2019) ; Conference date: 29-10-2019 Through 01-11-2019).
@inproceedings{0c962280b7244ada878649fed4228c8a,
title = {Best practices for the human evaluation of automatically generated text},
author = {Chris Lee and Albert Gatt and Emiel Miltenburg and Sander Wubben and Emiel Krahmer},
url = {https://www.inlg2019.com},
year = {2020},
date = {2020-12-00},
urldate = {2020-12-00},
booktitle = {Proceedings of the 12th International Conference on Natural Language Generation},
pages = {355–368},
publisher = {Association for Computational Linguistics},
abstract = {Currently, there is little agreement as to how Natural Language Generation (NLG) systems should be evaluated. While there is some agreement regarding automatic metrics, there is a high degree of variation in the way that human evaluation is carried out. This paper provides an overview of how human evaluation is currently conducted, and presents a set of best practices, grounded in the literature. With this paper, we hope to contribute to the quality and consistency of human evaluations in NLG.},
note = {12th International conference on Natural Language Generation (INLG 2019) ; Conference date: 29-10-2019 Through 01-11-2019},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
On the use of human reference data for evaluating automatic image descriptions Conference
2020, (2020 VizWiz Grand Challenge Workshop, VizWiz 2020 ; Conference date: 14-06-2020 Through 14-06-2020).
@conference{002bdf085b5a46878b1b8c1d470f85df,
title = {On the use of human reference data for evaluating automatic image descriptions},
author = {Emiel Miltenburg},
url = {https://vizwiz.org/workshops/2020-workshop/},
year = {2020},
date = {2020-06-14},
abstract = {Automatic image description systems are commonly trained and evaluated using crowdsourced, human-generated image descriptions. The best-performing system is then determined using some measure of similarity to the reference data (BLEU, Meteor, CIDEr, etc.). Thus, both the quality of the systems as well as the quality of the evaluation depend on the quality of the descriptions. As Section 2 will show, the quality of current image description datasets is insufficient. I argue that there is a need for more detailed guidelines that take into account the needs of visually impaired users, but also the feasibility of generating suitable descriptions. With high-quality data, evaluation of image description systems could use reference descriptions, but we should also look for alternatives.},
note = {2020 VizWiz Grand Challenge Workshop, VizWiz 2020 ; Conference date: 14-06-2020 Through 14-06-2020},
keywords = {},
pubstate = {published},
tppubtype = {conference}
}
Miltenburg, Emiel
Toevallige Haiku's Miscellaneous
2020.
@misc{1b3e11d1d60841889dcf3279e54f0fbf,
title = {Toevallige Haiku's},
author = {Emiel Miltenburg},
url = {https://neerlandistiek.nl/2020/04/toevallige-haikus/},
year = {2020},
date = {2020-04-18},
urldate = {2020-04-18},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2019
Ferreira, Thiago Castro; Lee, Chris; Miltenburg, Emiel; Krahmer, Emiel
Neural data-to-text generation: A comparison between pipeline and end-to-end architectures Proceedings Article
In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 552–562, Association for Computational Linguistics, 2019, (2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) ; Conference date: 03-11-2019 Through 07-11-2019).
@inproceedings{b0ed5e4c4c1e40d5b9369472dc05a3d1,
title = {Neural data-to-text generation: A comparison between pipeline and end-to-end architectures},
author = {Thiago Castro Ferreira and Chris Lee and Emiel Miltenburg and Emiel Krahmer},
url = {https://www.emnlp-ijcnlp2019.org/},
year = {2019},
date = {2019-11-00},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
pages = {552–562},
publisher = {Association for Computational Linguistics},
abstract = {Traditionally, most data-to-text applications have been designed using a modular pipeline architecture, in which non-linguistic input data is converted into natural language through several intermediate transformations. By contrast, recent neural models for data-to-text generation have been proposed as end-to-end approaches, where the non-linguistic input is rendered in natural language with much less explicit intermediate representations in between. This study introduces a systematic comparison between neural pipeline and end-to-end data-to-text approaches for the generation of text from RDF triples. Both architectures were implemented making use of the encoder-decoder Gated-Recurrent Units (GRU) and Transformer, two state-of-the-art deep learning methods. Automatic and human evaluations together with a qualitative analysis suggest that having explicit intermediate steps in the generation process results in better texts than the ones generated by end-to-end approaches. Moreover, the pipeline models generalize better to unseen inputs. Data and code are publicly available.},
note = {2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) ; Conference date: 03-11-2019 Through 07-11-2019},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
Pragmatic factors in (automatic) image description PhD Thesis
Vrije Universiteit Amsterdam, 2019.
@phdthesis{38c45b74b30942f89412e6008ad3db1b,
title = {Pragmatic factors in (automatic) image description},
author = {Emiel Miltenburg},
url = {https://hdl.handle.net/1871.1/a0acdca0-0122-466f-9daa-3507d298fcd2},
year = {2019},
date = {2019-10-14},
urldate = {2019-10-14},
school = {Vrije Universiteit Amsterdam},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
Ibrahimi, Sarah; Chen, Shuo; Arya, Devanshu; Câmara, Arthur; Chen, Yunlu; Crijns, Tanja; Goes, Maurits; Mensink, Thomas; Miltenburg, Emiel; Odijk, Daan; Thong, William; Zhao, Jiaojiao; Mettes, Pascal
Interactive Exploration of Journalistic Video Footage Through Multimodal Semantic Matching Proceedings Article
In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 2196–2198, ACM, 2019, ISBN: 9781450368896.
@inproceedings{e0b71f42b7474c34bacd91076a6b3ada,
title = {Interactive Exploration of Journalistic Video Footage Through Multimodal Semantic Matching},
author = {Sarah Ibrahimi and Shuo Chen and Devanshu Arya and Arthur Câmara and Yunlu Chen and Tanja Crijns and Maurits Goes and Thomas Mensink and Emiel Miltenburg and Daan Odijk and William Thong and Jiaojiao Zhao and Pascal Mettes},
doi = {10.1145/3343031.3350597},
isbn = {9781450368896},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the 27th ACM International Conference on Multimedia},
pages = {2196–2198},
publisher = {ACM},
series = {MM '19},
abstract = {This demo presents a system for journalists to explore video footage for broadcasts. Daily news broadcasts contain multiple news items that consist of many video shots and searching for relevant footage is a labor intensive task. Without the need for annotated video shots, our system extracts semantics from footage and automatically matches these semantics to query terms from the journalist. The journalist can then indicate which aspects of the query term need to be emphasized, e.g. the title or its thematic meaning. The goal of this system is to support the journalists in their search process by encouraging interaction and exploration with the system.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Kerkhof, Merel; Koolen, Ruud; Goudbeek, Martijn; Krahmer, Emiel
On task effects in NLG corpus elicitation: A replication study using mixed effects modeling Proceedings Article
In: Deemter, Kees; Lin, Chenghua; Takamura, Hiroya (Ed.): Proceedings of the 12th International Conference on Natural Language Generation (INLG 2019), 2019, (12th International conference on Natural Language Generation (INLG 2019) ; Conference date: 29-10-2019 Through 01-11-2019).
@inproceedings{122b84548e3e442d9b16df840b36cc02,
title = {On task effects in NLG corpus elicitation: A replication study using mixed effects modeling},
author = {Emiel Miltenburg and Merel Kerkhof and Ruud Koolen and Martijn Goudbeek and Emiel Krahmer},
editor = {Kees Deemter and Chenghua Lin and Hiroya Takamura},
url = {https://www.inlg2019.com},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the 12th International Conference on Natural Language Generation (INLG 2019)},
abstract = {Task effects in NLG corpus elicitation recently started to receive more attention, but are usually not modeled statistically. We present a controlled replication of the study by Van Miltenburg et al. (2018b), contrasting spoken with written descriptions. We collected additional written Dutch descriptions to supplement the spoken data from the DIDEC corpus, and analyzed the descriptions using mixed effects modeling to account for variation between participants and items. Our results show that the effects of modality largely disappear in a controlled setting.},
note = {12th International conference on Natural Language Generation (INLG 2019) ; Conference date: 29-10-2019 Through 01-11-2019},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2018
Miltenburg, Emiel; Elliott, Desmond; Vossen, Piek
Talking about other people: an endless range of possibilities Proceedings Article
In: Proceedings of the 11th International Conference on Natural Language Generation, pp. 415–420, Association for Computational Linguistics, 2018, (11th International Conference on Natural Language Generation, INLG 2018 ; Conference date: 05-11-2018 Through 08-11-2018).
@inproceedings{fb6af63b666145c8a11c73ab7608d304,
title = {Talking about other people: an endless range of possibilities},
author = {Emiel Miltenburg and Desmond Elliott and Piek Vossen},
url = {https://inlg2018.uvt.nl/},
year = {2018},
date = {2018-11-00},
booktitle = {Proceedings of the 11th International Conference on Natural Language Generation},
pages = {415–420},
publisher = {Association for Computational Linguistics},
abstract = {Image description datasets, such as Flickr30K and MS COCO, show a high degree of variation in the ways that crowd-workers talk about the world. Although this gives us a rich and diverse collection of data to work with, it also introduces uncertainty about how the world should be described. This paper shows the extent of this uncertainty in the PEOPLE-domain. We present a taxonomy of different ways to talk about other people. This taxonomy serves as a reference point to think about how other people should be described, and can be used to classify and compute statistics about labels applied to people.},
note = {11th International Conference on Natural Language Generation, INLG 2018 ; Conference date: 05-11-2018 Through 08-11-2018},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Elliott, Desmond; Vossen, Piek
Measuring the Diversity of Automatic Image Descriptions Proceedings Article
In: Proceedings of the 27th International Conference on Computational Linguistics, pp. 1730–1741, Association for Computational Linguistics, 2018, (International Conference on Computational Linguistics 2018, COLING 2018 ; Conference date: 20-08-2018 Through 26-08-2018).
@inproceedings{9266717f64c748ecb68e9beb1ac6b7e0,
title = {Measuring the Diversity of Automatic Image Descriptions},
author = {Emiel Miltenburg and Desmond Elliott and Piek Vossen},
url = {http://coling2018.org/},
year = {2018},
date = {2018-08-01},
urldate = {2018-08-01},
booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
pages = {1730–1741},
publisher = {Association for Computational Linguistics},
abstract = {Automatic image description systems typically produce generic sentences that only make use of a small subset of the vocabulary available to them. In this paper, we consider the production of generic descriptions as a lack of diversity in the output, which we quantify using established metrics and two new metrics that frame image description as a word recall task. This framing allows us to evaluate system performance on the head of the vocabulary, as well as on the long tail, where system performance degrades. We use these metrics to examine the diversity of the sentences generated by nine state-of-the-art systems on the MS COCO data set. We find that the systems trained with maximum likelihood objectives produce less diverse output than those trained with additional adversarial objectives. However, the adversarially-trained models only produce more types from the head of the vocabulary and not the tail. Besides vocabulary-based methods, we also look at the compositional capacity of the systems, specifically their ability to create compound nouns and prepositional phrases of different lengths. We conclude that there is still much room for improvement, and offer a toolkit to measure progress towards the goal of generating more diverse image descriptions.},
note = {International Conference on Computational Linguistics 2018, COLING 2018 ; Conference date: 20-08-2018 Through 26-08-2018},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Kadar, Akos; Koolen, Ruud; Krahmer, Emiel
DIDEC: The Dutch Image Description and Eye-tracking Corpus Proceedings Article
In: Proceedings of the 27th International Conference on Computational Linguistics, pp. 3658–3669, 2018, (International Conference on Computational Linguistics 2018, COLING 2018 ; Conference date: 20-08-2018 Through 26-08-2018).
@inproceedings{2910d9ba0ee9494386316bd424809701,
title = {DIDEC: The Dutch Image Description and Eye-tracking Corpus},
author = {Emiel Miltenburg and Akos Kadar and Ruud Koolen and Emiel Krahmer},
url = {http://coling2018.org/},
year = {2018},
date = {2018-01-01},
booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
pages = {3658–3669},
abstract = {We present a corpus of spoken Dutch image descriptions, paired with two sets of eye-tracking data: free viewing, where participants look at images without any particular purpose, and description viewing, where we track eye movements while participants produce spoken descriptions of the images they are viewing. This paper describes the data collection procedure and the corpus itself, and provides an initial analysis of self-corrections in image descriptions. We also present two studies showing the potential of this data. Though these studies mainly serve as an example, we do find two interesting results: (1) the eye-tracking data for the description viewing task is more coherent than for the free-viewing task; (2) variation in image descriptions (also called image specificity; Jas and Parikh, 2015) is only moderately correlated across different languages. Our corpus can be used to gain a deeper understanding of the image description task, particularly how visual attention is correlated with the image description process.},
note = {International Conference on Computational Linguistics 2018, COLING 2018 ; Conference date: 20-08-2018 Through 26-08-2018},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Koolen, Ruud; Krahmer, Emiel
Varying image description tasks: spoken versus written descriptions Proceedings Article
In: Proceedings of the Fifth Workshop on NLP for Similar Languages, Varieties and Dialects, pp. 88–100, 2018, (5th Workshop on NLP for Similar Languages, Varieties and Dialects, VarDial ; Conference date: 20-08-2018).
@inproceedings{5e6d7632be294e518c75c94c352d5ec7,
title = {Varying image description tasks: spoken versus written descriptions},
author = {Emiel Miltenburg and Ruud Koolen and Emiel Krahmer},
url = {https://aclanthology.org/W18-3910/},
year = {2018},
date = {2018-01-01},
urldate = {2018-01-01},
booktitle = {Proceedings of the Fifth Workshop on NLP for Similar Languages, Varieties and Dialects},
pages = {88–100},
abstract = {Automatic image description systems are commonly trained and evaluated on written image descriptions. At the same time, these systems are often used to provide spoken descriptions (e.g., for visually impaired users) through apps like TapTapSee or Seeing AI. This is not a problem, as long as spoken and written descriptions are very similar. However, linguistic research suggests that spoken language often differs from written language. These differences are not regular and vary from context to context. Therefore, this paper investigates whether there are differences between written and spoken image descriptions, even if they are elicited through similar tasks. We compared descriptions produced in two languages (English and Dutch) and found substantial differences between spoken and written descriptions in both languages. Future research should examine if users prefer the spoken over the written style and, if so, aim to emulate spoken descriptions.},
note = {5th Workshop on NLP for Similar Languages, Varieties and Dialects, VarDial ; Conference date: 20-08-2018},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2017
Miltenburg, Emiel; Elliott, Desmond; Vossen, Piek
Cross-linguistic differences and similarities in image descriptions Proceedings Article
In: Proceedings of the 10th International Conference on Natural Language Generation, pp. 21–30, Association for Computational Linguistics, 2017.
@inproceedings{606b60f3166d4e55a07979d9c73232cc,
title = {Cross-linguistic differences and similarities in image descriptions},
author = {Emiel Miltenburg and Desmond Elliott and Piek Vossen},
url = {https://aclanthology.org/W17-3503/},
year = {2017},
date = {2017-12-00},
urldate = {2017-12-00},
booktitle = {Proceedings of the 10th International Conference on Natural Language Generation},
pages = {21–30},
publisher = {Association for Computational Linguistics},
abstract = {Automatic image description systems are commonly trained and evaluated on large image description datasets. Recently, researchers have started to collect such datasets for languages other than English. An unexplored question is how different these datasets are from English and, if there are any differences, what causes them to differ. This paper provides a cross-linguistic comparison of Dutch, English, and German image descriptions. We find that these descriptions are similar in many respects, but the familiarity of crowd workers with the subjects of the images has a noticeable influence on the specificity of the descriptions.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
Pragmatic descriptions of perceptual stimuli Proceedings Article
In: Proceedings of the Student Research Workshop at the 15th Conference of the European Chapter of the Association for Computational Linguistics, pp. 1–10, Association for Computational Linguistics, 2017.
@inproceedings{4040b4067c8e4a408f4a01e2b5411836,
title = {Pragmatic descriptions of perceptual stimuli},
author = {Emiel Miltenburg},
url = {https://aclanthology.org/E17-4001/},
year = {2017},
date = {2017-04-00},
urldate = {2017-04-00},
booktitle = {Proceedings of the Student Research Workshop at the 15th Conference of the European Chapter of the Association for Computational Linguistics},
pages = {1–10},
publisher = {Association for Computational Linguistics},
abstract = {This research proposal discusses pragmatic factors in image description, arguing that current automatic image description systems do not take these factors into account. I present a general model of the human image description process, and propose to study this process using corpus analysis, experiments, and computational modeling. This will lead to a better characterization of human image description behavior, providing a road map for future research in automatic image description, and the automatic description of perceptual stimuli in general.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2016
Son, Chantal; Miltenburg, Emiel; Morante, Roser
Building a Dictionary of Affixal Negations Proceedings Article
In: Proceedings of the Workshop on Extra-Propositional Aspects of Meaning in Computational Linguistics (ExProM), pp. 49–56, The COLING 2016 Organizing Committee, 2016.
@inproceedings{72012e280e494b8382ca778315ff11c8,
title = {Building a Dictionary of Affixal Negations},
author = {Chantal Son and Emiel Miltenburg and Roser Morante},
url = {https://aclanthology.org/W16-5007/},
year = {2016},
date = {2016-12-00},
urldate = {2016-12-00},
booktitle = {Proceedings of the Workshop on Extra-Propositional Aspects of Meaning in Computational Linguistics (ExProM)},
pages = {49–56},
publisher = {The COLING 2016 Organizing Committee},
abstract = {This paper discusses the need for a dictionary of affixal negations and regular antonyms to facilitate their automatic detection in text. Without such a dictionary, affixal negations are very difficult to detect. In addition, we show that the set of affixal negations is not homogeneous, and that different NLP tasks may require different subsets. A dictionary can store the subtypes of affixal negations, making it possible to select a certain subset or to make inferences on the basis of these subtypes. We take a first step towards creating a negation dictionary by annotating all direct antonym pairs in WordNet using an existing typology of affixal negations. By highlighting some of the issues that were encountered in this annotation experiment, we hope to provide some insights into the necessary steps of building a negation dictionary.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel; Morante, Roser; Elliott, Desmond
Pragmatic Factors in Image Description: The Case of Negations Proceedings Article
In: Proceedings of the 5th Workshop on Vision and Language, pp. 54–59, Association for Computational Linguistics, 2016.
@inproceedings{bfc23ef34ee44ea0a687cc19ad84d687,
title = {Pragmatic Factors in Image Description: The Case of Negations},
author = {Emiel Miltenburg and Roser Morante and Desmond Elliott},
url = {https://aclanthology.org/W16-3207/},
year = {2016},
date = {2016-08-00},
urldate = {2016-08-00},
booktitle = {Proceedings of the 5th Workshop on Vision and Language},
pages = {54–59},
publisher = {Association for Computational Linguistics},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Postma, Marten; Miltenburg, Emiel; Segers, Roxane; Schoen, Anneleen; Vossen, Piek
Open Dutch WordNet Proceedings Article
In: Proceedings of the Eighth Global Wordnet Conference, 2016.
@inproceedings{3eeeaa00748845c88c7c18115bfbe3b2,
title = {Open Dutch WordNet},
author = {Marten Postma and Emiel Miltenburg and Roxane Segers and Anneleen Schoen and Piek Vossen},
url = {https://aclanthology.org/2016.gwc-1.43/},
year = {2016},
date = {2016-01-30},
urldate = {2016-01-30},
booktitle = {Proceedings of the Eighth Global Wordnet Conference},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Miltenburg, Emiel
Review of Semantic Similarity from Natural Language and Ontology Analysis Journal Article
In: Linguist List Issues, 2016, ISSN: 1068-4875.
@article{3073e3ddde3c4c47a18bd56493ba2c73,
title = {Review of Semantic Similarity from Natural Language and Ontology Analysis},
author = {Emiel Miltenburg},
url = {https://linguistlist.org/issues/27/2006/},
issn = {1068-4875},
year = {2016},
date = {2016-01-01},
urldate = {2016-01-01},
journal = {Linguist List Issues},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
van Tiel, Bob; van Miltenburg, Emiel; Zevakhina, Natalia; Geurts, Bart
Scalar Diversity Journal Article
In: Journal of Semantics, vol. 33, no. 1, pp. 137–175, 2016, ISSN: 0167-5133.
@article{dc18cbe8813844f6ac36fd8a69eb953f,
title = {Scalar Diversity},
author = {Bob van Tiel and Emiel van Miltenburg and Natalia Zevakhina and Bart Geurts},
doi = {10.1093/jos/ffu017},
issn = {0167-5133},
year = {2016},
date = {2016-01-01},
journal = {Journal of Semantics},
volume = {33},
number = {1},
pages = {137–175},
publisher = {Oxford University Press},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Miltenburg, Emiel
Stereotyping and Bias in the Flickr30k Dataset Proceedings Article
In: Edlund, Jens; Heylen, Dirk; Paggio, Patrizia (Ed.): Proceedings of Multimodal Corpora, pp. 1–4, 2016.
@inproceedings{1a69ca4cb5f64cb9a6bf10719f831f7e,
title = {Stereotyping and Bias in the Flickr30k Dataset},
author = {Emiel Miltenburg},
editor = {Jens Edlund and Dirk Heylen and Patrizia Paggio},
url = {https://arxiv.org/abs/1605.06083},
year = {2016},
date = {2016-01-01},
urldate = {2016-01-01},
booktitle = {Proceedings of Multimodal Corpora},
pages = {1–4},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}