├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── demo.py
├── images
│   ├── loc-narr.gif
│   ├── paper_thumb_1.jpeg
│   ├── paper_thumb_10.jpeg
│   ├── paper_thumb_11.jpeg
│   ├── paper_thumb_12.jpeg
│   ├── paper_thumb_13.jpeg
│   ├── paper_thumb_14.jpeg
│   ├── paper_thumb_2.jpeg
│   ├── paper_thumb_3.jpeg
│   ├── paper_thumb_4.jpeg
│   ├── paper_thumb_5.jpeg
│   ├── paper_thumb_6.jpeg
│   ├── paper_thumb_7.jpeg
│   ├── paper_thumb_8.jpeg
│   └── paper_thumb_9.jpeg
├── index.html
├── localized_narratives.py
├── transcription_example.py
└── web.js

/.gitignore:
--------------------------------------------------------------------------------
speech_api_env
.idea/
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to Contribute

We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Code reviews

All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.

## Community Guidelines

This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Localized Narratives
Visit the [project page](https://google.github.io/localized-narratives) for all the information about Localized Narratives: data downloads, visualizations, and much more.
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
# python3
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Demo usage of the Localized Narratives data loader."""
import localized_narratives

# The folder to which the annotation files are downloaded and from which they
# are read back.
local_dir = '/path/to/downloaded/data'

# The DataLoader class lets us download the data and read it from file.
data_loader = localized_narratives.DataLoader(local_dir)

# Downloads the annotation files (skipping any that were already downloaded).
data_loader.download_annotations('coco_val')

# Iterates through all annotations found in the local folder for a given
# dataset and split, or through a limited number of them (1 in this case).
# For `open_images_train`, for example, it will read only one shard if only
# one shard file was downloaded manually.
loc_narr = next(data_loader.load_annotations('coco_val', 1))

print(f'\nLocalized Narrative sample:\n{loc_narr}')

print(f'\nVoice recording URL:\n {loc_narr.voice_recording_url}\n')
--------------------------------------------------------------------------------
/images/loc-narr.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/loc-narr.gif
--------------------------------------------------------------------------------
/images/paper_thumb_1.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_1.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_10.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_10.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_11.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_11.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_12.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_12.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_13.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_13.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_14.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_14.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_2.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_2.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_3.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_3.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_4.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_4.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_5.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_5.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_6.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_6.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_7.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_7.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_8.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_8.jpeg
--------------------------------------------------------------------------------
/images/paper_thumb_9.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/localized-narratives/5c5b3031bc6feb1b453410b8cedece4541cf6e7c/images/paper_thumb_9.jpeg
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
Localized Narratives
Connecting Vision and Language with Localized Narratives

Publication
Connecting Vision and Language with Localized Narratives
Jordi Pont-Tuset, Jasper Uijlings, Soravit Changpinyo, Radu Soricut, and Vittorio Ferrari
ECCV (Spotlight), 2020
[PDF] [BibTeX] [1'30'' video] [10' video]

@inproceedings{PontTuset_eccv2020,
  author    = {Jordi Pont-Tuset and Jasper Uijlings and Soravit Changpinyo and Radu Soricut and Vittorio Ferrari},
  title     = {Connecting Vision and Language with Localized Narratives},
  booktitle = {ECCV},
  year      = {2020}
}
Abstract

We propose Localized Narratives, a new form of multimodal image annotations connecting vision and language. We ask annotators to describe an image with their voice while simultaneously hovering their mouse over the region they are describing. Since the voice and the mouse pointer are synchronized, we can localize every single word in the description. This dense visual grounding takes the form of a mouse trace segment per word and is unique to our data. We annotated 849k images with Localized Narratives: the whole COCO, Flickr30k, and ADE20K datasets, and 671k images of Open Images, all of which we make publicly available. We provide an extensive analysis of these annotations showing they are diverse, accurate, and efficient to produce. We also demonstrate their utility on the application of controlled image captioning.

Explore Localized Narratives

Explore some images and play the Localized Narrative annotation: synchronized voice, caption, and mouse trace. Don't forget to turn the sound on!

License

All the annotations available through this website are released under a CC BY 4.0 license. You are free to redistribute and modify the annotations, but we ask you to please keep the original attribution to our paper.

Code

Python Data Loader and Helpers

Visit the GitHub repository to view the code for downloading and working with Localized Narratives. Here is the documentation about the file formats used. Alternatively, you can manually download the data below.

From Traces to Boxes

This colab demonstrates how we get from a trace segment to a bounding box.
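As a rough illustration of the idea (the colab above is the authoritative version, and the exact cleanup steps it applies are not described on this page), one simple way to turn a trace segment into a box is to take the extent of its points, clamped to the image. The function below is a hedged sketch under that assumption, written against the trace-point format documented further down:

def trace_segment_to_box(segment, image_width, image_height):
  """Returns (x_min, y_min, x_max, y_max) in pixels for one trace segment.

  `segment` is a list of {'x': ..., 'y': ..., 't': ...} timed points with
  normalized coordinates, which can fall slightly outside [0, 1].
  """
  xs = [min(max(p['x'], 0.0), 1.0) for p in segment]  # Clamp to the image.
  ys = [min(max(p['y'], 0.0), 1.0) for p in segment]
  return (min(xs) * image_width, min(ys) * image_height,
          max(xs) * image_width, max(ys) * image_height)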
Downloads

Full Localized Narratives

Here you can download the full set of Localized Narratives (format description). Large files are split into shards (a list of them will appear when you click below). In parentheses, the number of Localized Narratives in each split. Please note that some images have more than one Localized Narrative annotation, e.g. 5k images in COCO are annotated 5 times.

File formats

The annotations are in JSON Lines format, that is, each line of the file is an independent valid JSON-encoded object. The largest files are split into smaller sub-files (shards) for ease of download. Since each line of the file is independent, the whole file can be reconstructed by simply concatenating the contents of the shards.

Each line represents one Localized Narrative annotation on one image by one annotator and has the following fields:
  • dataset_id: String identifying the dataset and split where the image belongs, e.g. mscoco_val2017.
  • image_id: String identifier of the image, as specified on each dataset.
  • annotator_id: Integer number uniquely identifying each annotator.
  • caption: Image caption as a string of characters.
  • timed_caption: List of timed utterances, i.e. {utterance, start_time, end_time}, where utterance is a word (or group of words) and (start_time, end_time) is the time during which it was spoken, with respect to the start of the recording.
  • traces: List of trace segments, one between each time the mouse pointer enters the image and goes away from it. Each trace segment is represented as a list of timed points, i.e. {x, y, t}, where x and y are the normalized image coordinates (with origin at the top-left corner of the image) and t is the time in seconds since the start of the recording. Please note that the coordinates can go a bit beyond the image, i.e. <0 or >1, as we recorded the mouse traces including a small band around the image. (See the sketch after the sample below for one way to pair these points with the timed caption.)
  • voice_recording: Relative URL path, with respect to https://storage.googleapis.com/localized-narratives/voice-recordings, where to find the voice recording (in OGG format) for that particular image.

Below is a sample of one Localized Narrative in this format:
{
  dataset_id: 'mscoco_val2017',
  image_id: '137576',
  annotator_id: 93,
  caption: 'In this image there are group of cows standing and eating th...',
  timed_caption: [{'utterance': 'In this', 'start_time': 0.0, 'end_time': 0.4}, ...],
  traces: [[{'x': 0.2086, 'y': -0.0533, 't': 0.022}, ...], ...],
  voice_recording: 'coco_val/coco_val_137576_93.ogg'
}
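To make the format concrete, here is a minimal sketch (plain Python, using only the fields documented above) that reads annotations from downloaded .jsonl shards and pairs each timed utterance with the trace points recorded while it was spoken. Since every line is an independent JSON object, reading the shards one after the other is equivalent to reading the concatenated file; the glob pattern follows the shard names used by localized_narratives.py below. The simple time-window pairing is only an illustration of how the synchronization can be used, not the paper's exact alignment:

import glob
import json

def read_annotations(shard_pattern):
  """Yields one annotation dict per line, across all matching shards."""
  for shard in sorted(glob.glob(shard_pattern)):
    with open(shard, 'r', encoding='utf-8') as f:
      for line in f:
        yield json.loads(line)

def trace_points_per_utterance(annotation):
  """Pairs each timed utterance with the trace points spoken during it."""
  all_points = [p for segment in annotation['traces'] for p in segment]
  return [(utt['utterance'],
           [p for p in all_points
            if utt['start_time'] <= p['t'] <= utt['end_time']])
          for utt in annotation['timed_caption']]

for annotation in read_annotations('coco_val_localized_narratives*.jsonl'):
  for word, points in trace_points_per_utterance(annotation):
    print(f'{word}: {len(points)} trace points')
  break  # Inspect only the first annotation.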
Open Images

  Train (507,444), in 10 shards
  Validation
  Test

COCO

  Train (134,272), in 4 shards
  Validation

Flickr30k

  Train
  Validation
  Test

ADE20k

  Train
  Validation
Textual captions only

To facilitate download, below are the annotations on the same images as above but containing only the textual caption, in case you are only interested in this part of Localized Narratives. Downloads are available for Open Images, COCO, Flickr30k, and ADE20k.
Automatic speech-to-text transcriptions

Below you can download the automatic speech-to-text transcriptions from the voice recordings. The format is a list of text chunks, each of which is a list of ten alternatives, each with its confidence.

Please note: the final caption text of Localized Narratives is given manually by the annotators. The automatic transcriptions below are only used to temporally align the manual transcription to the mouse traces. The timestamps used for this, though, were not stored, so the alignment process cannot be reproduced. To obtain timestamps, you would need to re-run Google's speech-to-text transcription (here is the code we used). Given that the API is constantly evolving, though, the transcription will likely not match the one stored below.
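The exact JSON layout of these transcription files is not spelled out on this page, so the helper below is only a hypothetical sketch: it assumes each transcription is a list of chunks, and each chunk a list of {'transcript', 'confidence'} alternatives. The field names are assumptions; adjust them to whatever the downloaded files actually contain.

def best_transcript(chunks):
  """Joins the highest-confidence alternative of each text chunk."""
  # `chunks`: list of chunks, each a list of (up to ten) alternatives.
  # The 'confidence' and 'transcript' keys are assumed, not documented.
  return ' '.join(
      max(chunk, key=lambda alt: alt['confidence'])['transcript']
      for chunk in chunks)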
Transcriptions are available for Open Images, COCO, Flickr30k, and ADE20k.
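If you do re-run the transcription to recover timestamps, note that transcription_example.py below already sets enable_word_time_offsets=True, so word-level timings can be read from the API response. A minimal sketch, assuming the google-cloud-speech response types used in that script (where start_time and end_time behave like timedelta durations):

def print_word_offsets(response):
  """Prints each recognized word with its start and end time, in seconds."""
  for result in response.results:
    top_alternative = result.alternatives[0]  # Highest confidence first.
    for word_info in top_alternative.words:
      print(f'{word_info.word}: '
            f'{word_info.start_time.total_seconds():.2f}s to '
            f'{word_info.end_time.total_seconds():.2f}s')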
--------------------------------------------------------------------------------
/localized_narratives.py:
--------------------------------------------------------------------------------
# python3
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data Loader for Localized Narratives."""

import json
import os
import re
from typing import Dict, Generator, List, NamedTuple
import wget  # type: ignore


_ROOT_URL = 'https://storage.googleapis.com/localized-narratives'
_ANNOTATIONS_ROOT_URL = f'{_ROOT_URL}/annotations'
_RECORDINGS_ROOT_URL = f'{_ROOT_URL}/voice-recordings'

_ANNOTATION_FILES = {
    'open_images_train': [
        f'open_images_train_v6_localized_narratives-{i:05d}-of-00010.jsonl'
        for i in range(10)
    ],
    'open_images_val': ['open_images_validation_localized_narratives.jsonl'],
    'open_images_test': ['open_images_test_localized_narratives.jsonl'],
    'coco_train': [
        f'coco_train_localized_narratives-{i:05d}-of-00004.jsonl'
        for i in range(4)
    ],
    'coco_val': ['coco_val_localized_narratives.jsonl'],
    'flickr30k_train': ['flickr30k_train_localized_narratives.jsonl'],
    'flickr30k_val': ['flickr30k_val_localized_narratives.jsonl'],
    'flickr30k_test': ['flickr30k_test_localized_narratives.jsonl'],
    'ade20k_train': ['ade20k_train_localized_narratives.jsonl'],
    'ade20k_val': ['ade20k_validation_localized_narratives.jsonl']
}  # type: Dict[str, List[str]]


class TimedPoint(NamedTuple):
  x: float
  y: float
  t: float


class TimedUtterance(NamedTuple):
  utterance: str
  start_time: float
  end_time: float


class LocalizedNarrative(NamedTuple):
  """Represents a Localized Narrative annotation.

  Visit https://google.github.io/localized-narratives/index.html?file-formats=1
  for the documentation of each field.
  """
  dataset_id: str
  image_id: str
  annotator_id: int
  caption: str
  timed_caption: List[TimedUtterance]
  traces: List[List[TimedPoint]]
  voice_recording: str

  @property
  def voice_recording_url(self) -> str:
    """Returns the absolute path where to find the voice recording file."""
    # Fixes the voice recording path for Flickr30K and ADE20k.
    if 'Flic' in self.dataset_id or 'ADE' in self.dataset_id:
      split_id, image_id = re.search(r'(\w+)/\w+_([0-9]+)_[0-9]+\.',
                                     self.voice_recording).groups()
      image_id = image_id.zfill(16)
      voice_recording = (f'{split_id}/'
                         f'{split_id}_{image_id}_{self.annotator_id}.ogg')
    else:
      voice_recording = self.voice_recording

    return f'{_RECORDINGS_ROOT_URL}/{voice_recording}'

  def __repr__(self):
    truncated_caption = self.caption[:60] + '...' if len(
        self.caption) > 63 else self.caption
    truncated_timed_caption = self.timed_caption[0].__str__()
    truncated_traces = self.traces[0][0].__str__()
    return (f'{{\n'
            f' dataset_id: {self.dataset_id},\n'
            f' image_id: {self.image_id},\n'
            f' annotator_id: {self.annotator_id},\n'
            f' caption: {truncated_caption},\n'
            f' timed_caption: [{truncated_timed_caption}, ...],\n'
            f' traces: [[{truncated_traces}, ...], ...],\n'
            f' voice_recording: {self.voice_recording}\n'
            f'}}')


def _expected_files(dataset_and_split: str) -> Generator[str, None, None]:
  try:
    yield from _ANNOTATION_FILES[dataset_and_split]
  except KeyError:
    raise ValueError(
        f'Unknown value for `dataset_and_split`: {dataset_and_split}')


class DataLoader:
  """Data Loader for Localized Narratives."""

  def __init__(self, local_root_dir: str):
    """DataLoader constructor.

    Args:
      local_root_dir: Local directory where the annotation files can be
        downloaded to and read from.
    """
    self._local_root_dir = local_root_dir
    self._current_open_file = None

  def download_annotations(self, dataset_and_split: str):
    """Downloads the Localized Narratives annotations.

    Args:
      dataset_and_split: Name of the dataset and split to download.
        Possible values are the keys in _ANNOTATION_FILES.
    """
    os.makedirs(self._local_root_dir, exist_ok=True)

    for filename in _expected_files(dataset_and_split):
      self._download_one_file(filename)

  def load_annotations(
      self, dataset_and_split: str, max_num_annotations: int = int(1e30)
  ) -> Generator[LocalizedNarrative, None, None]:
    """Loads the Localized Narratives annotations from local files.

    Args:
      dataset_and_split: Name of the dataset and split to load. Possible values
        are the keys in _ANNOTATION_FILES.
      max_num_annotations: Maximum number of annotations to load.

    Yields:
      One Localized Narrative at a time.
    """
    num_loaded = 0
    for local_file in self._find_files(dataset_and_split):
      self._current_open_file = open(local_file, 'rb')
      for line in self._current_open_file:
        yield LocalizedNarrative(**json.loads(line))
        num_loaded += 1
        if num_loaded == max_num_annotations:
          self._current_open_file.close()
          return
      self._current_open_file.close()

  def _local_file(self, filename: str) -> str:
    return os.path.join(self._local_root_dir, filename)

  def _find_files(self, dataset_and_split: str) -> Generator[str, None, None]:
    for filename in _expected_files(dataset_and_split):
      if os.path.exists(self._local_file(filename)):
        yield self._local_file(filename)

  def _download_one_file(self, filename: str):
    if not os.path.exists(self._local_file(filename)):
      print(f'Downloading: {filename}')
      wget.download(f'{_ANNOTATIONS_ROOT_URL}/{filename}',
                    self._local_file(filename))
      print()
    else:
      print(f'Already downloaded: {filename}')
--------------------------------------------------------------------------------
/transcription_example.py:
--------------------------------------------------------------------------------
# python3
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Example call to Google's speech-to-text API to transcribe Localized Narrative recordings.

Pre-requisites:
- Set up Google's API authentication:
  https://cloud.google.com/docs/authentication/getting-started
- Install dependencies:
  + pip install ffmpeg
  + pip install pydub
  + pip install google-cloud-speech

Comments:
- Google's speech-to-text API does not support the Vorbis encoding in which the
  Localized Narrative recordings were released. We therefore need to transcode
  them to Opus, which is supported. We do this in `convert_recording`.
- Transcription is limited to 60 seconds if the audio is loaded from a local
  file. For audio longer than 1 minute, we need to upload the file to a GCS
  bucket and load the audio using its URI:
  `audio = speech.RecognitionAudio(uri=gcs_uri)`.
"""
import io
import os

from google.cloud import speech
import pydub


def convert_recording(input_file, output_file):
  with open(input_file, 'rb') as f:
    recording = pydub.AudioSegment.from_file(f, codec='libvorbis')

  with open(output_file, 'wb') as f:
    recording.export(f, format='ogg', codec='libopus')


def speech_to_text(recording_file):
  # Loads from a local file. If longer than 60 seconds, upload to GCS and use
  # `audio = speech.RecognitionAudio(uri=gcs_uri)` instead.
  with io.open(recording_file, 'rb') as audio_file:
    content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

  config = speech.RecognitionConfig(
      encoding=speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
      sample_rate_hertz=48000,
      audio_channel_count=2,
      max_alternatives=10,
      enable_word_time_offsets=True,
      language_code='en-IN')

  client = speech.SpeechClient()
  operation = client.long_running_recognize(config=config, audio=audio)
  return operation.result(timeout=90)


if __name__ == '__main__':

  # Input encoded in Vorbis in an OGG container.
  input_recording = '/Users/jponttuset/Downloads/coco_val_137576_93.ogg'
  basename, extension = os.path.splitext(input_recording)
  output_recording = f'{basename}_opus{extension}'

  # Re-encodes in Opus and saves to file.
  convert_recording(input_recording, output_recording)

  # Actual call to Google's speech-to-text API.
  result = speech_to_text(output_recording)
  print(result)
--------------------------------------------------------------------------------