<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
        {font-family:Helvetica;
        panose-1:2 11 6 4 2 2 2 2 2 4;}
@font-face
        {font-family:Wingdings;
        panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:"Calibri Light";
        panose-1:2 15 3 2 2 2 4 3 2 4;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:Garamond;
        panose-1:2 2 4 4 3 3 1 1 8 3;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0cm;
        margin-bottom:.0001pt;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
h4
        {mso-style-priority:9;
        mso-style-link:"Heading 4 Char";
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:12.0pt;
        font-family:"Calibri",sans-serif;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:#0563C1;
        text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
        {mso-style-priority:99;
        color:#954F72;
        text-decoration:underline;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0cm;
        margin-right:0cm;
        margin-bottom:0cm;
        margin-left:36.0pt;
        margin-bottom:.0001pt;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
span.Heading4Char
        {mso-style-name:"Heading 4 Char";
        mso-style-priority:9;
        mso-style-link:"Heading 4";
        font-family:"Calibri Light",sans-serif;
        color:#2F5496;
        font-style:italic;}
p.msonormal0, li.msonormal0, div.msonormal0
        {mso-style-name:msonormal;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
span.EmailStyle20
        {mso-style-type:personal;
        font-family:"Calibri",sans-serif;
        color:windowtext;}
span.EmailStyle21
        {mso-style-type:personal-reply;
        font-family:"Calibri",sans-serif;
        color:#1F497D;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:612.0pt 792.0pt;
        margin:72.0pt 72.0pt 72.0pt 72.0pt;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:510069797;
        mso-list-type:hybrid;
        mso-list-template-ids:492076288 -627683698 -156891162 -2101312760 691421774 902879454 1343669878 813465894 616351260 -1707078572;}
@list l0:level1
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:36.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level2
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:72.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level3
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:108.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level4
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:144.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level5
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:180.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level6
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:216.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level7
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:252.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level8
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:288.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level9
        {mso-level-number-format:bullet;
        mso-level-text:§;
        mso-level-tab-stop:324.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l1
        {mso-list-id:745225051;
        mso-list-template-ids:-1476902708;}
@list l1:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:36.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:72.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:108.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:144.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:180.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:216.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:252.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:288.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
@list l1:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:324.0pt;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        mso-ansi-font-size:10.0pt;
        font-family:Symbol;}
ol
        {margin-bottom:0cm;}
ul
        {margin-bottom:0cm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-US" link="#0563C1" vlink="#954F72">
<div class="WordSection1">
<p class="MsoNormal"><span style="color:#1F497D">Dear Ernie,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D"><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">In their 2018 report “Common Patterns in Revolutionary Infrastructures and Data” (<a href="https://www.rd-alliance.org/sites/default/files/Common_Patterns_in_Revolutionising_Infrastructures-final.pdf">https://www.rd-alliance.org/sites/default/files/Common_Patterns_in_Revolutionising_Infrastructures-final.pdf</a>
 ), Peter Wittenburg and George Strawn write “Results from surveys and interviews indicate that current data management and processing mechanisms are highly inefficient. An RDA survey from 2013[16] stated that typically a data scientist is spending 75% of his
 time on “data wrangling”[17]. M. Brodie reported about an MIT study [3] indicating that data scientists spend 80% on data wrangling and a recent study from CrowdFlower[18] also came up with 79% of the time being spent on data wrangling in industry.”
<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">[3] M. L. Brodie, Understanding Data Science: An Emerging Discipline for Data-Intensive Discovery, keynote,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">Proc.of the XVII Int’l Conf Data Analytics and Management in Data Intensive Domains (DAMDID’2015), Obninsk,<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="SV" style="color:#1F497D">Russia, October 13-16, 2015.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="SV" style="color:#1F497D">[16] RDA EU survey: http://hdl.handle.net/11304/6e1424cc-8927-11e4-ac7e-860aa0063d1f<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">[17] "Data Wrangling includes all preparatory steps necessary to finally start the analytics.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">[18] Crowdflower: <a href="https://visit.crowdflower.com/WC-2017-Data-Science-Report_LP.html">
https://visit.crowdflower.com/WC-2017-Data-Science-Report_LP.html</a><o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D"><o:p> </o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">Hope this helps!<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D">/Maggie<o:p></o:p></span></p>
<p class="MsoNormal"><span style="color:#1F497D"><o:p> </o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal"><b>From:</b> CODATA-international &lt;codata-international-bounces@lists.codata.org&gt;
<b>On Behalf Of </b>Johnson, Jon<br>
<b>Sent:</b> Friday, December 11, 2020 10:00<br>
<b>To:</b> Ernie Boyko &lt;boykern@yahoo.com&gt;; CODATA International &lt;codata-international@lists.codata.org&gt;<br>
<b>Subject:</b> Re: [CODATA-international] Cost of Data Wrangling<o:p></o:p></p>
</div>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span lang="EN-GB">Hi Eric<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB">It’s a bit of an urban myth I think see <a href="https://blog.ldodds.com/2020/01/31/do-data-scientists-spend-80-of-their-time-cleaning-data-turns-out-no/">
https://blog.ldodds.com/2020/01/31/do-data-scientists-spend-80-of-their-time-cleaning-data-turns-out-no/</a>, but it aligns with the Pareto Principle, so we are all willing to go with it!<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB">I suppose it is not that important whether it is 80% or 60%, it’s still a massive problem and the takeaway is that it highlights where the source of most effort is being expended, and strongly suggests that it arises
 from poor data quality and lack of metadata to manage that.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB">Jon Johnson <o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB">CLOSER, UCL Institute of Social Research<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB">@spuddybike<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB"><o:p> </o:p></span></p>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal"><b><span lang="EN-GB" style="font-size:12.0pt;color:black">From:
</span></b><span lang="EN-GB" style="font-size:12.0pt;color:black">CODATA-international &lt;<a href="mailto:codata-international-bounces@lists.codata.org">codata-international-bounces@lists.codata.org</a>&gt; on behalf of Ernie Boyko &lt;<a href="mailto:boykern@yahoo.com">boykern@yahoo.com</a>&gt;<br>
<b>Reply to: </b>Ernie Boyko &lt;<a href="mailto:boykern@yahoo.com">boykern@yahoo.com</a>&gt;<br>
<b>Date: </b>Friday, 11 December 2020 at 07:24<br>
<b>To: </b>CODATA International &lt;<a href="mailto:codata-international@lists.codata.org">codata-international@lists.codata.org</a>&gt;<br>
<b>Subject: </b>[CODATA-international] Cost of Data Wrangling<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB"><o:p> </o:p></span></p>
</div>
<div>
<div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif">Hi all<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif">A study conducted for the EU? is often quoted as being the source of a statement along the lines of<o:p></o:p></span></p>
<div>
<div style="margin-left:50.4pt;margin-bottom:6.0pt">
<p class="MsoNormal" style="margin-left:72.0pt;text-indent:-18.0pt;mso-list:l0 level2 lfo3">
<![if !supportLists]><span lang="EN-GB" style="font-size:14.0pt;font-family:Wingdings"><span style="mso-list:Ignore">§<span style="font:7.0pt 'Times New Roman'"> 
</span></span></span><![endif]><b><span style="font-size:14.0pt;color:#404040">80% of effort in data intensive research is used on data wrangling; conservative estimate of 10.2 Bn Euro. </span></b><span lang="EN-GB" style="font-size:14.0pt;font-family:'Times New Roman',serif"><o:p></o:p></span></p>
</div>
</div>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif;color:black"> Can anyone on this list point me to this study?</span><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif"><o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif">Many thanks in advance.  I am trying to make the case for the benefits of developing a career stream for data wranglers/data stewards.<o:p></o:p></span></p>
</div>
<div>
<div id="ydp112ffa72yui_3_13_0_1_1383570742306_21877">
<div>
<div id="ydp112ffa72yui_3_13_0_1_1383923334020_29546">
<div id="ydp112ffa72yui_3_13_0_1_1383923334020_29543">
<p class="MsoNormal"><span lang="EN-GB" style="font-size:13.5pt;font-family:'Times New Roman',serif;color:#4C76A2;background:white">Cheers, Ernie</span><span lang="EN-GB" style="font-size:13.5pt;font-family:'Times New Roman',serif"><o:p></o:p></span></p>
<p class="MsoNormal" style="background:white"><span lang="EN-GB" style="font-size:13.5pt;font-family:'Times New Roman',serif;color:black">+1-613-290-2804</span><span lang="EN-GB" style="font-size:13.5pt;font-family:'Times New Roman',serif"><o:p></o:p></span></p>
<p class="MsoNormal" style="background:white"><span lang="EN-GB" style="font-size:13.5pt;font-family:'Times New Roman',serif"><o:p> </o:p></span></p>
<p class="MsoNormal" style="background:white"><b><span lang="EN-GB" style="font-size:12.0pt;font-family:'Helvetica',sans-serif;color:black"> </span></b><b><span lang="EN-GB" style="font-size:13.5pt;font-family:'Garamond',serif;color:black"> “Data is the new
 oil.” — Clive Humby</span></b><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif"><o:p></o:p></span></p>
<div>
<h4 style="margin:0cm;margin-bottom:.0001pt;line-height:15.75pt;background:white;vertical-align:baseline;font-stretch:inherit">
<span lang="EN-GB" style="font-family:'Garamond',serif;color:black;border:none windowtext 1.0pt;padding:0cm">“Data really powers everything that we do.” – Jeff Weiner<o:p></o:p></span></h4>
<p class="MsoNormal" style="margin-bottom:12.0pt;background:white"><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif"><o:p> </o:p></span></p>
</div>
<div>
<p class="MsoNormal" style="margin-bottom:12.0pt;background:white"><b><span lang="EN-GB" style="font-size:12.0pt;font-family:'Times New Roman',serif"><o:p> </o:p></span></b></p>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
</html>