├── .vscode
│   ├── extensions.json
│   ├── settings.json
│   ├── launch.json
│   └── tasks.json
├── host.json
├── scripts
│   ├── pine2.py
│   ├── perfolder.py
│   ├── sqlqueue.py
│   ├── pine.py
│   ├── featureVectors.py
│   ├── mergeCopyFromAzure.py
│   └── pineQuery.py
├── PixelArtSearch.csproj
├── ImageBlobTrigger.cs
├── SqlQueueTrigger.cs
├── FileQueueTrigger.cs
├── PageQueueTrigger.cs
├── ContentQueueTrigger.cs
├── index.html
├── LoadOpenGameArt.cs
├── ZipQueueTrigger.cs
├── .gitignore
├── Common.cs
└── README.md

/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": [
3 |     "ms-azuretools.vscode-azurefunctions",
4 |     "ms-dotnettools.csharp"
5 |   ]
6 | }
7 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "azureFunctions.deploySubpath": "bin/Release/netcoreapp3.1/publish",
3 |   "azureFunctions.projectLanguage": "C#",
4 |   "azureFunctions.projectRuntime": "~3",
5 |   "debug.internalConsoleOptions": "neverOpen",
6 |   "azureFunctions.preDeployTask": "publish"
7 | }
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 |   "version": "0.2.0",
3 |   "configurations": [
4 |     {
5 |       "name": "Attach to .NET Functions",
6 |       "type": "coreclr",
7 |       "request": "attach",
8 |       "processId": "${command:azureFunctions.pickProcess}"
9 |     }
10 |   ]
11 | }
--------------------------------------------------------------------------------
/host.json:
--------------------------------------------------------------------------------
1 | {
2 |   "version": "2.0",
3 |   "extensions": {
4 |     "blobs": {},
5 |     "queues": {
6 |       "batchSize": 1
7 |     }
8 |   },
9 |   "logging": {
10 |     "applicationInsights": {
11 |       "samplingSettings": {
12 |         "isEnabled": true
13 |       }
14 |     }
15 |   }
16 | }
--------------------------------------------------------------------------------
/scripts/pine2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8
4 | # Original author: "Eivind Magnus Hvidevold"
5 | # License: GNU GPLv3 at http://www.gnu.org/licenses/gpl.html
6 |
7 | '''
8 | '''
9 |
10 | import os
11 | import sys
12 | import re
13 | import json
14 | import numpy as np
15 | import pinecone
16 | import pinecone.graph
17 | import pinecone.service
18 | import pinecone.connector
19 |
20 | def main():
21 |     'entry point'
22 |
23 |     with open('../local.settings.json') as fd:
24 |         settings = json.load(fd)
25 |         apiKey = settings['Values']['PINECONE']
26 |
27 |     pinecone.init(api_key=apiKey)
28 |
29 |     print(pinecone.service.ls())
30 |     #for service in pinecone.service.ls():
31 |     #    pinecone.service.stop(service)
32 |
33 |
34 | if __name__ == '__main__':
35 |     main()
36 |
37 |
--------------------------------------------------------------------------------
/PixelArtSearch.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |   <PropertyGroup>
3 |     <TargetFramework>netcoreapp3.1</TargetFramework>
4 |     <AzureFunctionsVersion>v3</AzureFunctionsVersion>
5 |   </PropertyGroup>
6 |   <ItemGroup>
7 |     <!-- The PackageReference entries here were stripped during extraction and are not recoverable. -->
8 |   </ItemGroup>
9 |   <ItemGroup>
10 |     <None Update="host.json">
11 |       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
12 |     </None>
13 |     <None Update="local.settings.json">
14 |       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
15 |       <CopyToPublishDirectory>Never</CopyToPublishDirectory>
16 |     </None>
17 |   </ItemGroup>
18 | </Project>
--------------------------------------------------------------------------------
/scripts/perfolder.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
# -*- coding: utf-8 -*- 3 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8 4 | # Original author: "Eivind Magnus Hvidevold" 5 | # License: GNU GPLv3 at http://www.gnu.org/licenses/gpl.html 6 | 7 | ''' 8 | ''' 9 | 10 | import os 11 | import sys 12 | import re 13 | 14 | def main(): 15 | 'entry point' 16 | perfolder = {} 17 | for line2 in sys.stdin.readlines(): 18 | line = line2.strip() 19 | if 'extract/' in line: 20 | folders = line.split('/') 21 | if len(folders) < 3: 22 | print('WARN', line) 23 | pass 24 | else: 25 | folder = folders[2] 26 | if folder in perfolder: 27 | perfolder[folder].append(line) 28 | else: 29 | perfolder[folder] = [line] 30 | for key, value in sorted(perfolder.items(), key=lambda x: len(x[1])): 31 | print(key, len(value)) 32 | 33 | if __name__ == '__main__': 34 | main() 35 | 36 | -------------------------------------------------------------------------------- /ImageBlobTrigger.cs: -------------------------------------------------------------------------------- 1 | /* 2 | using System; 3 | using System.IO; 4 | using Microsoft.Azure.WebJobs; 5 | using Microsoft.Azure.WebJobs.Host; 6 | using Microsoft.Extensions.Logging; 7 | using System.Linq; 8 | 9 | namespace HvidevoldDevelopmentENK.GetPixelArt 10 | { 11 | public static class ImageBlobTrigger 12 | { 13 | [FunctionName("ImageBlobTrigger")] 14 | public static void Run( 15 | [BlobTrigger("opengameart/{name}", Connection = "AzureWebJobsStorage")] Stream myBlob, 16 | string name, 17 | [Queue("imgqueue"), StorageAccount("AzureWebJobsStorage")] ICollector msg, 18 | ILogger log) 19 | { 20 | log.LogInformation($"C# ImageBlobTrigger function Processed blob\n Name:{name} \n Size: {myBlob.Length} Bytes"); 21 | 22 | var isJpg = name.Split('.').Last().ToLower() == "png"; 23 | var isPng = name.Split('.').Last().ToLower() == "jpg"; 24 | 25 | if (isJpg || isPng) { 26 | msg.Add(name); 27 | } 28 | } 29 | } 30 | } 31 | */ -------------------------------------------------------------------------------- /SqlQueueTrigger.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.WebJobs.Extensions.Http; 7 | using Microsoft.AspNetCore.Http; 8 | using Microsoft.Extensions.Logging; 9 | using System.Net.Http; 10 | using Newtonsoft.Json; 11 | using HtmlAgilityPack; 12 | using System.Web; 13 | using Microsoft.Azure.Storage.Blob; 14 | using System.Linq; 15 | 16 | namespace HvidevoldDevelopmentENK.GetPixelArt 17 | { 18 | public static class SqlQueueTrigger 19 | { 20 | [FunctionName("SqlQueueTrigger")] 21 | public static async Task Run( 22 | [QueueTrigger("sqlqueue", Connection = "AzureWebJobsStorage")] string myQueueItem, 23 | [Blob("opengameart/{queueTrigger}")] CloudBlockBlob blob, 24 | [Queue("imgqueue"), StorageAccount("AzureWebJobsStorage")] ICollector imgs, 25 | ILogger log) 26 | { 27 | log.LogInformation($"C# SqlQueueTrigger function processed: {myQueueItem}"); 28 | 29 | if (await blob.ExistsAsync() && blob.Properties.Length > 0) { 30 | await Common.AfterUploadFile(myQueueItem, blob.Properties.Length, log, imgs); 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "clean", 6 | "command": "dotnet", 7 | "args": [ 8 | "clean", 9 
| "/property:GenerateFullPaths=true", 10 | "/consoleloggerparameters:NoSummary" 11 | ], 12 | "type": "process", 13 | "problemMatcher": "$msCompile" 14 | }, 15 | { 16 | "label": "build", 17 | "command": "dotnet", 18 | "args": [ 19 | "build", 20 | "/property:GenerateFullPaths=true", 21 | "/consoleloggerparameters:NoSummary" 22 | ], 23 | "type": "process", 24 | "dependsOn": "clean", 25 | "group": { 26 | "kind": "build", 27 | "isDefault": true 28 | }, 29 | "problemMatcher": "$msCompile" 30 | }, 31 | { 32 | "label": "clean release", 33 | "command": "dotnet", 34 | "args": [ 35 | "clean", 36 | "--configuration", 37 | "Release", 38 | "/property:GenerateFullPaths=true", 39 | "/consoleloggerparameters:NoSummary" 40 | ], 41 | "type": "process", 42 | "problemMatcher": "$msCompile" 43 | }, 44 | { 45 | "label": "publish", 46 | "command": "dotnet", 47 | "args": [ 48 | "publish", 49 | "--configuration", 50 | "Release", 51 | "/property:GenerateFullPaths=true", 52 | "/consoleloggerparameters:NoSummary" 53 | ], 54 | "type": "process", 55 | "dependsOn": "clean release", 56 | "problemMatcher": "$msCompile" 57 | }, 58 | { 59 | "type": "func", 60 | "dependsOn": "build", 61 | "options": { 62 | "cwd": "${workspaceFolder}/bin/Debug/netcoreapp3.1" 63 | }, 64 | "command": "host start", 65 | "isBackground": true, 66 | "problemMatcher": "$func-dotnet-watch" 67 | } 68 | ] 69 | } -------------------------------------------------------------------------------- /FileQueueTrigger.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.WebJobs.Extensions.Http; 7 | using Microsoft.AspNetCore.Http; 8 | using Microsoft.Extensions.Logging; 9 | using System.Net.Http; 10 | using Newtonsoft.Json; 11 | using HtmlAgilityPack; 12 | using System.Web; 13 | using Microsoft.Azure.Storage.Blob; 14 | using System.Linq; 15 | 16 | namespace HvidevoldDevelopmentENK.GetPixelArt 17 | { 18 | public static class FileQueueTrigger 19 | { 20 | static readonly HttpClient client = new HttpClient(); 21 | 22 | [FunctionName("FileQueueTrigger")] 23 | public static async Task Run( 24 | [QueueTrigger("filequeue", Connection = "AzureWebJobsStorage")] string filename, 25 | [Blob("opengameart/{queueTrigger}")] CloudBlockBlob blob, 26 | [Queue("zipqueue"), StorageAccount("AzureWebJobsStorage")] ICollector msg, 27 | [Queue("sqlqueue"), StorageAccount("AzureWebJobsStorage")] ICollector sqls, 28 | ILogger log) 29 | { 30 | log.LogInformation($"C# FileQueueTrigger function processed page {filename}"); 31 | 32 | try 33 | { 34 | var (responseBody, size) = await Common.ReadURIOrCacheBinary(blob, Common.FileURI + filename, client); 35 | 36 | var isZip = filename.Split('.').Last().ToLower() == "zip"; 37 | var isRar = filename.Split('.').Last().ToLower() == "rar"; 38 | 39 | if (isZip || isRar) { 40 | msg.Add(filename); 41 | } 42 | 43 | sqls.Add(filename); 44 | //await Common.AfterUploadFile(filename, size, log, imgs); 45 | } 46 | catch(HttpRequestException e) 47 | { 48 | log.LogError("\nException Caught!"); 49 | log.LogError("Message :{0} ",e.Message); 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /scripts/sqlqueue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8 4 | # 
Original author: "Eivind Magnus Hvidevold" 5 | # License: GNU GPLv3 at http://www.gnu.org/licenses/gpl.html 6 | 7 | ''' 8 | ''' 9 | 10 | import sys 11 | import re 12 | import json 13 | import os, uuid 14 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__ 15 | from azure.storage.queue import ( 16 | QueueClient, 17 | BinaryBase64EncodePolicy, 18 | BinaryBase64DecodePolicy 19 | ) 20 | 21 | def main(): 22 | 'entry point' 23 | 24 | 25 | def step3(): 26 | with open('../local.settings.json') as fd: 27 | settings = json.load(fd) 28 | connectionString = settings["Values"]["AzureWebJobsStorage"] 29 | #os.environ["AZURE_STORAGE_CONNECTION_STRING"] = connectionString 30 | 31 | container_name = "opengameart" 32 | 33 | with open('putToSqlQueue.txt') as fd: 34 | files = [fname.strip() for fname in fd.readlines()] 35 | #files = [fname for fname in files if fname.lower().endswith('.jpg') or fname.lower().endswith('.png')] 36 | files = files[282000:] 37 | 38 | # Retrieve the connection string from an environment 39 | # variable named AZURE_STORAGE_CONNECTION_STRING 40 | connect_str = os.getenv("AZURE_STORAGE_CONNECTION_STRING") 41 | 42 | # Create a unique name for the queue 43 | #q_name = "queue-" + str(uuid.uuid4()) 44 | q_name = 'sqlqueue' 45 | 46 | # Instantiate a QueueClient object which will 47 | # be used to create and manipulate the queue 48 | #queue_client = QueueClient.from_connection_string(connectionString, q_name) 49 | 50 | # Setup Base64 encoding and decoding functions 51 | base64_queue_client = QueueClient.from_connection_string( 52 | conn_str=connectionString, queue_name=q_name, 53 | message_encode_policy = BinaryBase64EncodePolicy(), 54 | message_decode_policy = BinaryBase64DecodePolicy() 55 | ) 56 | 57 | for i, message in enumerate(files): 58 | if i % 1000 == 0: 59 | print(i) 60 | base64_queue_client.send_message(message.encode('ascii')) 61 | 62 | if __name__ == '__main__': 63 | step3() 64 | -------------------------------------------------------------------------------- /PageQueueTrigger.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.WebJobs.Extensions.Http; 7 | using Microsoft.AspNetCore.Http; 8 | using Microsoft.Extensions.Logging; 9 | using System.Net.Http; 10 | using Newtonsoft.Json; 11 | using HtmlAgilityPack; 12 | using System.Web; 13 | using Microsoft.Azure.Storage.Blob; 14 | using System.Collections.Generic; 15 | 16 | namespace HvidevoldDevelopmentENK.GetPixelArt 17 | { 18 | public static class PageQueueTrigger 19 | { 20 | static readonly HttpClient client = new HttpClient(); 21 | 22 | [FunctionName("PageQueueTrigger")] 23 | public static async Task Run( 24 | [QueueTrigger("pagequeue", Connection = "AzureWebJobsStorage")] string page, 25 | [Blob("opengameart/pages/page{queueTrigger}.html")] CloudBlockBlob blob, 26 | [Queue("contentqueue"), StorageAccount("AzureWebJobsStorage")] ICollector msg, 27 | ILogger log) 28 | { 29 | log.LogInformation($"C# PageQueueTrigger function processed page {page}"); 30 | 31 | string responseBody = null; 32 | 33 | try 34 | { 35 | responseBody = await Common.ReadURIOrCache(blob, Common.SearchURI + "&page=" + page, client); 36 | 37 | var htmlDoc = new HtmlDocument(); 38 | htmlDoc.LoadHtml(responseBody); 39 | var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body"); 40 | 41 | var hashSet = new HashSet(); 
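// Collect the unique "/content/..." links on this search-results page; each
// link is enqueued below so ContentQueueTrigger can fetch and scrape that art page.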
42 | 43 | foreach (var nNode in htmlBody.Descendants("a")) 44 | { 45 | if (nNode.NodeType == HtmlNodeType.Element && nNode.Attributes["href"] != null && nNode.Attributes["href"].Value.StartsWith("/content/")) 46 | { 47 | hashSet.Add(HttpUtility.HtmlDecode(nNode.Attributes["href"].Value)); 48 | } 49 | } 50 | foreach (var urlPart in hashSet) { 51 | msg.Add(urlPart); 52 | } 53 | } 54 | catch(HttpRequestException e) 55 | { 56 | log.LogError("\nException Caught!"); 57 | log.LogError("Message :{0} ",e.Message); 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /ContentQueueTrigger.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.WebJobs.Extensions.Http; 7 | using Microsoft.AspNetCore.Http; 8 | using Microsoft.Extensions.Logging; 9 | using System.Net.Http; 10 | using Newtonsoft.Json; 11 | using HtmlAgilityPack; 12 | using System.Web; 13 | using Microsoft.Azure.Storage.Blob; 14 | 15 | namespace HvidevoldDevelopmentENK.GetPixelArt 16 | { 17 | public static class ContentQueueTrigger 18 | { 19 | static readonly HttpClient client = new HttpClient(); 20 | 21 | [FunctionName("ContentQueueTrigger")] 22 | public static async Task Run( 23 | [QueueTrigger("contentqueue", Connection = "AzureWebJobsStorage")] string page, 24 | [Blob("opengameart/{queueTrigger}.html")] CloudBlockBlob blob, 25 | [Queue("filequeue"), StorageAccount("AzureWebJobsStorage")] ICollector msg, 26 | ILogger log) 27 | { 28 | log.LogInformation($"C# ContentQueueTrigger function processed page {page}"); 29 | 30 | string responseBody = null; 31 | 32 | try 33 | { 34 | responseBody = await Common.ReadURIOrCache(blob, Common.BaseURI + page, client); 35 | 36 | var htmlDoc = new HtmlDocument(); 37 | htmlDoc.LoadHtml(responseBody); 38 | var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body"); 39 | 40 | foreach (var nNode in htmlBody.Descendants("a")) 41 | { 42 | if (nNode.NodeType == HtmlNodeType.Element && 43 | nNode.Attributes["href"] != null && 44 | nNode.Attributes["href"].Value.Contains("/default/files/")) 45 | { 46 | msg.Add(HttpUtility.HtmlDecode(nNode.Attributes["href"].Value.Replace(Common.FileURI, ""))); 47 | } 48 | } 49 | } 50 | catch(HttpRequestException e) 51 | { 52 | log.LogError("\nException Caught!"); 53 | log.LogError("Message :{0} ",e.Message); 54 | log.LogError("Stack :{0}", e.StackTrace.ToString()); 55 | } 56 | catch (NullReferenceException e) { 57 | log.LogError("\nException Caught!"); 58 | log.LogError("Message :{0} ", e.Message); 59 | log.LogError("Stack :{0}", e.StackTrace.ToString()); 60 | } 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | OpenGameArt Search 5 | 6 | 7 | 8 | 9 | 10 | 11 | Fork me on GitHub 12 | 13 |
[index.html's markup was stripped during extraction; only its visible text survived:]
Sorry, search is down (forever, saving money on disk space)
OpenGameArt Search + Reverse Image Search
Hint: Start search term with http(s):// for reverse image search.
Made using the pinecone.io free trial, so it will be live for one month, until April 3rd, 2021.
[The search form, results container, and the page's inline script were not recovered.]
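The page's inline script was lost, but scripts/pineQuery.py (dumped further below) shows the request shape its backend accepts: a POST that is either form-encoded (imsi=<term>) or JSON carrying an "imsi" search term (or an http(s):// image URL for reverse image search) plus an optional "count". A minimal client sketch under those assumptions; the endpoint URL is hypothetical, since the server's host and port are not visible in this dump:

import json
import urllib.request

# Hypothetical endpoint: pineQuery.py starts an HTTPServer, but its address and
# port are not recoverable from this dump.
URL = 'http://localhost:8080/'

def search(term, count=10):
    # Matches pineQuery.py's do_POST JSON branch: 'imsi' is the search term or
    # an image URL; 'count' is clamped server-side (negative -> 10, max 100).
    body = json.dumps({'imsi': term, 'count': count}).encode('utf-8')
    req = urllib.request.Request(URL, data=body,
                                 headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read().decode('utf-8')

if __name__ == '__main__':
    print(search('dragon'))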
25 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /LoadOpenGameArt.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.WebJobs.Extensions.Http; 7 | using Microsoft.AspNetCore.Http; 8 | using Microsoft.Extensions.Logging; 9 | using System.Net.Http; 10 | using Newtonsoft.Json; 11 | using HtmlAgilityPack; 12 | using System.Web; 13 | using Microsoft.Azure.Storage.Blob; 14 | 15 | namespace HvidevoldDevelopmentENK.GetPixelArt 16 | { 17 | public static class HttpTrigger 18 | { 19 | // HttpClient is intended to be instantiated once per application, rather than per-use. See Remarks. 20 | static readonly HttpClient client = new HttpClient(); 21 | 22 | [FunctionName("LoadOpenGameArt")] 23 | public static async Task Run( 24 | [HttpTrigger(AuthorizationLevel.Anonymous, "get", "post", Route = null)] HttpRequest req, 25 | [Queue("pagequeue"),StorageAccount("AzureWebJobsStorage")] ICollector msg, 26 | [Blob("opengameart/pages/index.html")] CloudBlockBlob blob, 27 | ILogger log) 28 | { 29 | log.LogInformation("C# HTTP trigger function processed a request."); 30 | 31 | string name = req.Query["name"]; 32 | 33 | string requestBody = await new StreamReader(req.Body).ReadToEndAsync(); 34 | dynamic data = JsonConvert.DeserializeObject(requestBody); 35 | name = name ?? data?.name; 36 | 37 | string responseMessage = string.IsNullOrEmpty(name) 38 | ? "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response." 39 | : $"Hello, {name}. This HTTP triggered function executed successfully."; 40 | 41 | // Call asynchronous network methods in a try/catch block to handle exceptions. 
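// Common.ReadURIOrCache (defined in Common.cs) serves the page from the blob
// cache when it exists; otherwise it waits 5 seconds (a politeness delay),
// fetches the URI, and caches the response in the blob before returning it.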
42 | string responseBody = null; 43 | try 44 | { 45 | //const string uri = "https://opengameart.org/"; 46 | 47 | // using (TextReader tr = new StreamReader(index)) { 48 | responseBody = await Common.ReadURIOrCache(blob, Common.SearchURI, client); 49 | 50 | string html = responseBody; 51 | var htmlDoc = new HtmlDocument(); 52 | htmlDoc.LoadHtml(html); 53 | var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body"); 54 | string page = null; 55 | foreach (var nNode in htmlBody.Descendants("li")) 56 | { 57 | if (nNode.NodeType == HtmlNodeType.Element && nNode.HasClass("pager-last")) 58 | { 59 | var aNode = nNode.Element("a"); 60 | //Console.WriteLine("Node name: " + aNode.Name); 61 | //Console.WriteLine(aNode.InnerText); 62 | //Console.WriteLine(aNode.Attributes["href"].Value); 63 | if (aNode != null) { 64 | var href = HttpUtility.HtmlDecode(aNode.Attributes["href"].Value); 65 | if (href != null) { 66 | Uri myUri = new Uri("https://opengameart.org/" + href); 67 | page = HttpUtility.ParseQueryString(myUri.Query).Get("page"); 68 | log.LogInformation("Last page: " + page); 69 | } 70 | } 71 | } 72 | } 73 | 74 | int ipage = int.Parse(page); 75 | //ipage = 1; 76 | if (page != null && ipage > 0 && ipage < 1000) { 77 | for (int i = 0; i <= ipage; i++) { 78 | msg.Add(string.Format("{0}", i)); 79 | } 80 | } 81 | 82 | //Console.WriteLine(htmlBody.OuterHtml); 83 | } 84 | catch(HttpRequestException e) 85 | { 86 | Console.WriteLine("\nException Caught!"); 87 | Console.WriteLine("Message :{0} ",e.Message); 88 | } 89 | catch(UriFormatException e) { 90 | Console.WriteLine("\nException Caught!"); 91 | Console.WriteLine("Message :{0} ",e.Message); 92 | } 93 | 94 | if (!string.IsNullOrEmpty(name)) 95 | { 96 | // Add a message to the output collection. 97 | //msg.Add(string.Format("Name passed to the function: {0}", name)); 98 | } 99 | 100 | return 101 | responseBody != null ? 
102 | new ContentResult { Content = responseBody, ContentType = "text/html" } : 103 | new ContentResult { Content = responseMessage, ContentType = "text/plain" }; 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /ZipQueueTrigger.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.Storage; 7 | using Microsoft.Azure.WebJobs.Extensions.Http; 8 | using Microsoft.AspNetCore.Http; 9 | using Microsoft.Extensions.Logging; 10 | using System.Net.Http; 11 | using Newtonsoft.Json; 12 | using HtmlAgilityPack; 13 | using System.Web; 14 | using Microsoft.Azure.Storage.Blob; 15 | using System.Collections.Generic; 16 | using System.Linq; 17 | using SharpCompress.Archives; 18 | using SharpCompress.Archives.Zip; 19 | using SharpCompress.Archives.Rar; 20 | using SharpCompress.Common; 21 | using SharpCompress.Readers; 22 | using System.Text; 23 | 24 | namespace HvidevoldDevelopmentENK.GetPixelArt 25 | { 26 | 27 | public static class ZipQueueTrigger 28 | { 29 | public static async Task Extract(IEnumerable archiveEntries, string zipfile, CloudBlobContainer container, ILogger log, ICollector sqls) { 30 | string lastOutBlobName = Common.ExtractFolder + zipfile + "/" + HttpUtility.UrlEncode(archiveEntries.Last().Key, Encoding.UTF8); 31 | var lastOutBlob = container.GetBlockBlobReference(lastOutBlobName); 32 | if (await lastOutBlob.ExistsAsync() && lastOutBlob.Properties.Length == archiveEntries.Last().Size) { 33 | log.LogInformation($"Last file {lastOutBlobName} already exists, so skipped ALL."); 34 | } else { 35 | foreach (var archiveEntry in archiveEntries.Where(entry => !entry.IsDirectory)) 36 | { 37 | log.LogInformation($"Now processing {archiveEntry.Key}"); 38 | 39 | string outBlobName = Common.ExtractFolder + zipfile + "/" + HttpUtility.UrlEncode(archiveEntry.Key, Encoding.UTF8); 40 | 41 | log.LogInformation($"Writing blob {outBlobName}"); 42 | 43 | NameValidator.ValidateBlobName(outBlobName); 44 | 45 | var blockBlob = container.GetBlockBlobReference(outBlobName); 46 | if (await blockBlob.ExistsAsync() && blockBlob.Properties.Length == archiveEntry.Size) { 47 | log.LogInformation($"{outBlobName} already exists, so skipped."); 48 | } else { 49 | await using var fileStream = archiveEntry.OpenEntryStream(); 50 | await blockBlob.UploadFromStreamAsync(fileStream); 51 | //await Common.AfterUploadFile(outBlobName, blockBlob.Properties.Length, log, imgs); 52 | 53 | log.LogInformation($"{outBlobName} processed successfully and moved to destination container."); 54 | } 55 | sqls.Add(outBlobName); 56 | } 57 | } 58 | } 59 | 60 | [FunctionName("ZipQueueTrigger")] 61 | public static async Task Run([ 62 | QueueTrigger("zipqueue", Connection = "AzureWebJobsStorage")] string zipfile, 63 | [Blob("opengameart/{queueTrigger}")] CloudBlockBlob blob, 64 | [StorageAccount("AzureWebJobsStorage")] CloudStorageAccount storageAccount, 65 | [Queue("sqlqueue"), StorageAccount("AzureWebJobsStorage")] ICollector sqls, 66 | ILogger log) 67 | { 68 | log.LogInformation($"C# ZipQueueTrigger function processed: {zipfile}"); 69 | 70 | var isZip = zipfile.Split('.').Last().ToLower() == "zip"; 71 | var isRar = zipfile.Split('.').Last().ToLower() == "rar"; 72 | 73 | try{ 74 | if(isZip || isRar){ 75 | 76 | CloudBlobClient blobClient = storageAccount.CreateCloudBlobClient(); 77 | 78 | 
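// The archive blob is buffered into a MemoryStream below; SharpCompress then
// enumerates its entries, each entry is re-uploaded as a blob under
// Common.ExtractFolder + "<archive>/<entry>", and the new blob name is queued
// to sqlqueue so SqlQueueTrigger can register it in the database.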
CloudBlobContainer container = blobClient.GetContainerReference(Common.Container); 79 | await container.CreateIfNotExistsAsync(); 80 | 81 | using(MemoryStream blobMemStream = new MemoryStream()){ 82 | 83 | await blob.DownloadToStreamAsync(blobMemStream); 84 | 85 | var zipReaderOptions = new ReaderOptions() 86 | { 87 | ArchiveEncoding = new ArchiveEncoding(Encoding.UTF8, Encoding.UTF8), LookForHeader = true 88 | }; 89 | 90 | log.LogInformation("Blob is a zip/rar file; beginning extraction...."); 91 | blobMemStream.Position = 0; 92 | 93 | if (isZip) { 94 | using (var reader = ZipArchive.Open(blobMemStream, zipReaderOptions)) { 95 | await Extract(reader.Entries, zipfile, container, log, sqls); 96 | } 97 | } else if (isRar) { 98 | using (var reader = RarArchive.Open(blobMemStream, zipReaderOptions)) { 99 | await Extract(reader.Entries, zipfile, container, log, sqls); 100 | } 101 | } 102 | } 103 | } 104 | } 105 | catch(Exception ex){ 106 | log.LogError($"Error! Something went wrong: {ex.Message}"); 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # Azure Functions localsettings file 5 | local.settings.json 6 | 7 | # User-specific files 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | bld/ 24 | [Bb]in/ 25 | [Oo]bj/ 26 | [Ll]og/ 27 | 28 | # Visual Studio 2015 cache/options directory 29 | .vs/ 30 | # Uncomment if you have tasks that create the project's static files in wwwroot 31 | #wwwroot/ 32 | 33 | # MSTest test Results 34 | [Tt]est[Rr]esult*/ 35 | [Bb]uild[Ll]og.* 36 | 37 | # NUNIT 38 | *.VisualState.xml 39 | TestResult.xml 40 | 41 | # Build Results of an ATL Project 42 | [Dd]ebugPS/ 43 | [Rr]eleasePS/ 44 | dlldata.c 45 | 46 | # DNX 47 | project.lock.json 48 | project.fragment.lock.json 49 | artifacts/ 50 | 51 | *_i.c 52 | *_p.c 53 | *_i.h 54 | *.ilk 55 | *.meta 56 | *.obj 57 | *.pch 58 | *.pdb 59 | *.pgc 60 | *.pgd 61 | *.rsp 62 | *.sbr 63 | *.tlb 64 | *.tli 65 | *.tlh 66 | *.tmp 67 | *.tmp_proj 68 | *.log 69 | *.vspscc 70 | *.vssscc 71 | .builds 72 | *.pidb 73 | *.svclog 74 | *.scc 75 | 76 | # Chutzpah Test files 77 | _Chutzpah* 78 | 79 | # Visual C++ cache files 80 | ipch/ 81 | *.aps 82 | *.ncb 83 | *.opendb 84 | *.opensdf 85 | *.sdf 86 | *.cachefile 87 | *.VC.db 88 | *.VC.VC.opendb 89 | 90 | # Visual Studio profiler 91 | *.psess 92 | *.vsp 93 | *.vspx 94 | *.sap 95 | 96 | # TFS 2012 Local Workspace 97 | $tf/ 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | *.DotSettings.user 106 | 107 | # JustCode is a .NET coding add-in 108 | .JustCode 109 | 110 | # TeamCity is a build add-in 111 | _TeamCity* 112 | 113 | # DotCover is a Code Coverage Tool 114 | *.dotCover 115 | 116 | # NCrunch 117 | _NCrunch_* 118 | .*crunch*.local.xml 119 | nCrunchTemp_* 120 | 121 | # MightyMoose 122 | *.mm.* 123 | AutoTest.Net/ 124 | 125 | # Web workbench (sass) 126 | .sass-cache/ 127 | 128 | # Installshield output folder 129 | [Ee]xpress/ 130 | 131 | # DocProject is a documentation generator add-in 132 | 
DocProject/buildhelp/ 133 | DocProject/Help/*.HxT 134 | DocProject/Help/*.HxC 135 | DocProject/Help/*.hhc 136 | DocProject/Help/*.hhk 137 | DocProject/Help/*.hhp 138 | DocProject/Help/Html2 139 | DocProject/Help/html 140 | 141 | # Click-Once directory 142 | publish/ 143 | 144 | # Publish Web Output 145 | *.[Pp]ublish.xml 146 | *.azurePubxml 147 | # TODO: Comment the next line if you want to checkin your web deploy settings 148 | # but database connection strings (with potential passwords) will be unencrypted 149 | #*.pubxml 150 | *.publishproj 151 | 152 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 153 | # checkin your Azure Web App publish settings, but sensitive information contained 154 | # in these scripts will be unencrypted 155 | PublishScripts/ 156 | 157 | # NuGet Packages 158 | *.nupkg 159 | # The packages folder can be ignored because of Package Restore 160 | **/packages/* 161 | # except build/, which is used as an MSBuild target. 162 | !**/packages/build/ 163 | # Uncomment if necessary however generally it will be regenerated when needed 164 | #!**/packages/repositories.config 165 | # NuGet v3's project.json files produces more ignoreable files 166 | *.nuget.props 167 | *.nuget.targets 168 | 169 | # Microsoft Azure Build Output 170 | csx/ 171 | *.build.csdef 172 | 173 | # Microsoft Azure Emulator 174 | ecf/ 175 | rcf/ 176 | 177 | # Windows Store app package directories and files 178 | AppPackages/ 179 | BundleArtifacts/ 180 | Package.StoreAssociation.xml 181 | _pkginfo.txt 182 | 183 | # Visual Studio cache files 184 | # files ending in .cache can be ignored 185 | *.[Cc]ache 186 | # but keep track of directories ending in .cache 187 | !*.[Cc]ache/ 188 | 189 | # Others 190 | ClientBin/ 191 | ~$* 192 | *~ 193 | *.dbmdl 194 | *.dbproj.schemaview 195 | *.jfm 196 | *.pfx 197 | *.publishsettings 198 | node_modules/ 199 | orleans.codegen.cs 200 | 201 | # Since there are multiple workflows, uncomment next line to ignore bower_components 202 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 203 | #bower_components/ 204 | 205 | # RIA/Silverlight projects 206 | Generated_Code/ 207 | 208 | # Backup & report files from converting an old project file 209 | # to a newer Visual Studio version. 
Backup files are not needed, 210 | # because we have git ;-) 211 | _UpgradeReport_Files/ 212 | Backup*/ 213 | UpgradeLog*.XML 214 | UpgradeLog*.htm 215 | 216 | # SQL Server files 217 | *.mdf 218 | *.ldf 219 | 220 | # Business Intelligence projects 221 | *.rdl.data 222 | *.bim.layout 223 | *.bim_*.settings 224 | 225 | # Microsoft Fakes 226 | FakesAssemblies/ 227 | 228 | # GhostDoc plugin setting file 229 | *.GhostDoc.xml 230 | 231 | # Node.js Tools for Visual Studio 232 | .ntvs_analysis.dat 233 | 234 | # Visual Studio 6 build log 235 | *.plg 236 | 237 | # Visual Studio 6 workspace options file 238 | *.opt 239 | 240 | # Visual Studio LightSwitch build output 241 | **/*.HTMLClient/GeneratedArtifacts 242 | **/*.DesktopClient/GeneratedArtifacts 243 | **/*.DesktopClient/ModelManifest.xml 244 | **/*.Server/GeneratedArtifacts 245 | **/*.Server/ModelManifest.xml 246 | _Pvt_Extensions 247 | 248 | # Paket dependency manager 249 | .paket/paket.exe 250 | paket-files/ 251 | 252 | # FAKE - F# Make 253 | .fake/ 254 | 255 | # JetBrains Rider 256 | .idea/ 257 | *.sln.iml 258 | 259 | # CodeRush 260 | .cr/ 261 | 262 | # Python Tools for Visual Studio (PTVS) 263 | __pycache__/ 264 | *.pyc 265 | 266 | # emh 267 | *.swp 268 | *.swo 269 | *.txt 270 | -------------------------------------------------------------------------------- /scripts/pine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8 4 | # Original author: "Eivind Magnus Hvidevold" 5 | # License: GNU GPLv3 at http://www.gnu.org/licenses/gpl.html 6 | 7 | ''' 8 | ''' 9 | 10 | import os 11 | import sys 12 | import re 13 | import json 14 | import numpy as np 15 | import pinecone 16 | import pinecone.graph 17 | import pinecone.service 18 | import pinecone.connector 19 | import time 20 | import multiprocessing 21 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__ 22 | 23 | class DBC(): pass 24 | downloadBlobContext = DBC() 25 | pineconeService = 'opengameart-search2' 26 | storepath = '/mnt/data2/opengameart2/' 27 | 28 | def main(): 29 | 'entry point' 30 | with open('../local.settings.json') as fd: 31 | settings = json.load(fd) 32 | connectionString = settings["Values"]["AzureWebJobsStorage"] 33 | apiKey = settings['Values']['PINECONE'] 34 | 35 | pinecone.init(api_key=apiKey) 36 | 37 | #provision() 38 | loadData(connectionString) 39 | 40 | def downloadBlob(args): 41 | vid, path = args 42 | container_name = downloadBlobContext.container_name 43 | blob_service_client = downloadBlobContext.blob_service_client 44 | container_client = downloadBlobContext.container_client 45 | try: 46 | dest = storepath + path 47 | if os.path.exists(dest): 48 | with open(dest, 'rb') as infd: 49 | filedata = infd.read() 50 | return (vid, path, filedata, True) 51 | else: 52 | blob_client = blob_service_client.get_blob_client(container=container_name, blob=path) 53 | filedata = blob_client.download_blob().content_as_bytes() 54 | return (vid, path, filedata, False) 55 | except: 56 | e = sys.exc_info()[0] 57 | print('Error on:', path, e, file=sys.stderr) 58 | raise 59 | time.sleep(1) 60 | 61 | def listFiles(vidStart, batchSize): 62 | batch = [] 63 | vid = vidStart 64 | with open('allblobs.txt') as fd: 65 | for line in fd: 66 | path = line.strip() 67 | if '__MACOSX' in path: 68 | continue 69 | if path.endswith('.np'): 70 | dest = '/mnt/data2/opengameart2/' + path 71 | #if os.path.exists(dest): 72 | # continue 
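# Vector ids are sequential; vidStart is recovered from the last line of
# database.txt in loadData(), so a re-run continues numbering where the
# previous upsert left off.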
73 | vid += 1 74 | batch.append([vid, path]) 75 | if len(batch) >= batchSize: 76 | yield batch 77 | batch = [] 78 | if len(batch) > 0: 79 | yield batch 80 | 81 | def loadData(connectionString): 82 | batchSize = 10 83 | container_name = 'opengameart' 84 | blob_service_client = BlobServiceClient.from_connection_string(connectionString) 85 | container_client = blob_service_client.get_container_client("opengameart") 86 | 87 | downloadBlobContext.container_name = container_name 88 | downloadBlobContext.blob_service_client = blob_service_client 89 | downloadBlobContext.container_client = container_client 90 | 91 | conn = pinecone.connector.connect(pineconeService) 92 | batch = [] 93 | paths = [] 94 | items = [] 95 | pool = multiprocessing.Pool(batchSize) 96 | if not os.path.exists('database.txt'): 97 | vidStart = 0 98 | else: 99 | with open('database.txt') as fd: 100 | lines = fd.readlines() 101 | vidStart, lastPath = lines[-1].split('\t') 102 | vidStart = int(vidStart) 103 | 104 | for batch in listFiles(vidStart, batchSize): 105 | for vid, path, filedata, saved in pool.map(downloadBlob, batch): 106 | try: 107 | if len(filedata) == 16384: 108 | dest = storepath + path 109 | dname = os.path.split(dest)[0] 110 | if not os.path.exists(dname): 111 | os.makedirs(dname) 112 | if not saved: 113 | with open(dest, 'wb') as ofd: 114 | ofd.write(filedata) 115 | vector = np.frombuffer(filedata, dtype=np.float32) 116 | assert len(vector) == 4096 117 | items.append((vid, vector)) 118 | paths.append((vid, path)) 119 | print(vid, path) 120 | except: 121 | e = sys.exc_info()[0] 122 | print('Error on:', path, e, file=sys.stderr) 123 | time.sleep(1) 124 | #raise 125 | if len(items) >= 100: 126 | with open('database.txt', 'a') as ofd: 127 | for vid, path in paths: 128 | ofd.write(str(vid) + '\t' + path + '\n') 129 | acks = conn.upsert(items=items).collect() 130 | print(acks) 131 | items = [] 132 | paths = [] 133 | 134 | if len(items) > 0: 135 | acks = conn.upsert(items=items).collect() 136 | print(acks) 137 | 138 | def provision(): 139 | # items = [] 140 | # items.extend((f'class-a-{i}', vec) for i, vec in enumerate(np.random.randn(500, 1024) + 0.0)) 141 | # items.extend((f'class-b-{i}', vec) for i, vec in enumerate(np.random.randn(500, 1024) + 1.0)) 142 | # queries = np.random.randn(100, 1024) + 0.0 # the queries belong to class a 143 | # print(items[0][1].shape) 144 | 145 | graph = pinecone.graph.IndexGraph(engine_type='approximated', metric='cosine', shards=10, replicas=1, node_type=pinecone.utils.constants.NodeType.STANDARD, gateway_replicas=1) 146 | # you can do things like 147 | # # graph.add_read_preprocessor('my_item_transformer_image_uri') 148 | # # graph.add_write_preprocessor('my_query_transformer_image_uri') 149 | # # graph.add_postprocessor('my_postprocessor_image_uri')o 150 | 151 | pinecone.service.deploy(service_name=pineconeService, graph=graph) 152 | 153 | conn = pinecone.connector.connect(pineconeService) 154 | 155 | #acks = conn.upsert(items=items).collect() 156 | #results = conn.query(queries=queries).collect() 157 | #print(results[0]) 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /Common.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Threading.Tasks; 4 | using Microsoft.AspNetCore.Mvc; 5 | using Microsoft.Azure.WebJobs; 6 | using Microsoft.Azure.WebJobs.Extensions.Http; 7 | using Microsoft.AspNetCore.Http; 8 | 
using Microsoft.Extensions.Logging; 9 | using System.Net.Http; 10 | using Newtonsoft.Json; 11 | using HtmlAgilityPack; 12 | using System.Web; 13 | using Microsoft.Azure.Storage.Blob; 14 | using Microsoft.Data.SqlClient; 15 | using System.Linq; 16 | 17 | namespace HvidevoldDevelopmentENK.GetPixelArt 18 | { 19 | public static class Common 20 | { 21 | public static string BaseURI = "https://opengameart.org"; 22 | public static string FileURI = "https://opengameart.org/sites/default"; 23 | public static string SearchURI = "https://opengameart.org/art-search-advanced?keys=&title=&field_art_tags_tid_op=or&field_art_tags_tid=&name=&field_art_type_tid%5B%5D=9&field_art_licenses_tid%5B%5D=17981&field_art_licenses_tid%5B%5D=2&field_art_licenses_tid%5B%5D=17982&field_art_licenses_tid%5B%5D=3&field_art_licenses_tid%5B%5D=6&field_art_licenses_tid%5B%5D=5&field_art_licenses_tid%5B%5D=10310&field_art_licenses_tid%5B%5D=4&field_art_licenses_tid%5B%5D=8&field_art_licenses_tid%5B%5D=7&sort_by=created&sort_order=DESC&items_per_page=144&Collection="; 24 | 25 | public static string ExtractFolder = "/extract"; 26 | public static string Container = "opengameart"; 27 | 28 | public static bool CreatedDatabase = false; 29 | 30 | public static async Task ReadURIOrCache(CloudBlockBlob blob, string uri, HttpClient client) 31 | { 32 | string responseBody = null; 33 | 34 | if (await blob.ExistsAsync()) 35 | { 36 | responseBody = await blob.DownloadTextAsync(); 37 | } 38 | 39 | if (responseBody == null || responseBody.Length == 0) 40 | { 41 | // Avoid spamming server 42 | await Task.Delay(5000); 43 | responseBody = await client.GetStringAsync(uri); 44 | await blob.UploadTextAsync(responseBody); 45 | } 46 | 47 | return responseBody; 48 | } 49 | 50 | public static async Task> ReadURIOrCacheBinary(CloudBlockBlob blob, string uri, HttpClient client, bool needsData = false) 51 | { 52 | //byte[] responseBody; 53 | MemoryStream ms = new MemoryStream(); 54 | 55 | if (await blob.ExistsAsync() && blob.Properties.Length > 0) { 56 | if (needsData) { 57 | await blob.DownloadToStreamAsync(ms); 58 | } 59 | } 60 | else 61 | { 62 | // Avoid spamming server 63 | await Task.Delay(5000); 64 | await blob.UploadFromStreamAsync(await client.GetStreamAsync(uri)); 65 | if (needsData) { 66 | await blob.DownloadToStreamAsync(ms); 67 | } 68 | } 69 | 70 | return Tuple.Create(ms.ToArray(), blob.Properties.Length); 71 | } 72 | 73 | public static async Task UpdateDatabase(string fileName, long fileSize, ILogger log) { 74 | var connectionString = Environment.GetEnvironmentVariable("SqlDb"); 75 | //var connectionString = ConfigurationManager.ConnectionStrings["MyDB"].ConnectionString; 76 | int rows = 0; 77 | 78 | using (SqlConnection conn = new SqlConnection(connectionString)) 79 | { 80 | conn.Open(); 81 | if (!CreatedDatabase) { 82 | var text = @"if not exists (select * from sysobjects where name='opengameartblobfiles' and xtype='U') 83 | create table opengameartblobfiles ( 84 | id INT NOT NULL IDENTITY PRIMARY KEY, 85 | name VARCHAR(900) NOT NULL UNIQUE, 86 | size INT NOT NULL, 87 | hasfeatures BIT NOT NULL, 88 | inmilvus BIT NOT NULL 89 | )"; 90 | using (SqlCommand cmd = new SqlCommand(text, conn)) 91 | { 92 | // Execute the command and log the # rows affected. 
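// On the first call in this process the guarded CREATE TABLE above runs and an
// index on inmilvus is created below; the static CreatedDatabase flag keeps
// this setup from being retried on every queue message.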
93 |                     var tableRows = await cmd.ExecuteNonQueryAsync();
94 |                     log.LogInformation($"Create Table: {tableRows} rows were updated");
95 |                 }
96 |                 using (SqlCommand cmd = new SqlCommand("if not exists (select * from sysindexes where name='inmilvus1') CREATE INDEX inmilvus1 ON opengameartblobfiles (inmilvus);", conn)) {
97 |                     // Execute the command and log the # rows affected.
98 |                     var tableRows = await cmd.ExecuteNonQueryAsync();
99 |                     log.LogInformation($"Create Index for Table: {tableRows} rows were updated");
100 |                 }
101 |                 CreatedDatabase = true;
102 |             }
103 |
104 |             // Parameterized so quotes in blob names cannot break (or inject into) the statement.
105 |             var update = "INSERT INTO opengameartblobfiles (name, size, hasfeatures, inmilvus) VALUES (@name, @size, 0, 0);";
106 |             log.LogInformation($"Trying SQL: {update}");
107 |
108 |             using (SqlCommand cmd = new SqlCommand(update, conn))
109 |             {
110 |                 cmd.Parameters.AddWithValue("@name", fileName);
111 |                 cmd.Parameters.AddWithValue("@size", fileSize);
112 |                 // Execute the command and log the # rows affected.
113 |                 rows = await cmd.ExecuteNonQueryAsync();
114 |                 log.LogInformation($"UpdateDatabase file: {fileName} of size {fileSize} where {rows} rows were updated.");
115 |             }
116 |         }
117 |         return rows > 0;
118 |     }
119 |
120 |     public static async Task AfterUploadFile(string fileName, long fileSize, ILogger log, ICollector<string> imgs) {
121 |         if (await UpdateDatabase(fileName, fileSize, log)) {
122 |             var isPng = fileName.Split('.').Last().ToLower() == "png";
123 |             var isJpg = fileName.Split('.').Last().ToLower() == "jpg";
124 |
125 |             if (isJpg || isPng) {
126 |                 imgs.Add(fileName);
127 |             }
128 |         }
129 |     }
130 | }
131 | }
--------------------------------------------------------------------------------
/scripts/featureVectors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Preparation:
4 | # conda create -n tensorflow-gpu tensorflow-gpu
5 | # conda install -c conda-forge azure-storage-blob
6 | # conda install -c anaconda pillow
7 | # conda install -c anaconda cudatoolkit
8 | # cd ~/.keras/models
9 | # wget https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
10 | # sudo apt install nvidia-cuda-dev
11 | #
12 | # New instructions:
13 | # https://medium.com/analytics-vidhya/install-tensorflow-gpu-2-4-0-with-cuda-11-0-and-cudnn-8-using-anaconda-8c6472c9653f
14 | # conda create -n cudatoolkit cudatoolkit
15 | # conda install -c conda-forge azure-storage-blob
16 | # conda install -c anaconda pillow
17 | # pip3 install tensorflow-gpu
18 |
19 | import json
20 | import os, uuid
21 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__
22 | import os
23 |
24 | with open('../local.settings.json') as fd:
25 |     settings = json.load(fd)
26 |     connectionString = settings["Values"]["AzureWebJobsStorage"]
27 |     #os.environ["AZURE_STORAGE_CONNECTION_STRING"] = connectionString
28 |
29 | container_name = "opengameart"
30 | sizes = False
31 |
32 | def step1():
33 |     try:
34 |         print("Azure Blob Storage v" + __version__ + " - Python quickstart sample")
35 |
36 |         # Quick start code goes here
37 |
38 |         # Create the BlobServiceClient object which will be used to create a container client
39 |         blob_service_client = BlobServiceClient.from_connection_string(connectionString)
40 |         container_client = blob_service_client.get_container_client(container_name)
41 |
42 |         print("\nListing blobs...")
43 |
44 |         # List the blobs in the container
45 |         blob_list = container_client.list_blobs()
46 |         for blob in blob_list:
47 |             if sizes:
48 |                 blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob.name)
49 |                 properties = blob_client.get_blob_properties()
50
| print(blob.name + '\t' + str(properties.size)) 51 | else: 52 | print(blob.name) 53 | 54 | except Exception as ex: 55 | print('Exception:') 56 | print(ex) 57 | 58 | def step2(): 59 | with open('log.txt') as fd: 60 | seen = set([x.strip() for x in fd.readlines()]) 61 | with open('putToSqlQueue.txt') as fd: 62 | files = [fname.strip() for fname in fd.readlines()] 63 | files = [fname for fname in files if fname.lower().endswith('.jpg') or fname.lower().endswith('.png')] 64 | files = [fname for fname in files if not fname in seen] 65 | files = [fname for fname in files if not 'MACOSX' in fname] 66 | print(len(files)) 67 | 68 | from tensorflow.keras.applications.vgg16 import VGG16 69 | from tensorflow.keras.preprocessing import image 70 | from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input 71 | from tensorflow.keras.models import Model 72 | from tensorflow.compiler import xla 73 | #from keras.applications.vgg16 import VGG16 74 | #from keras.preprocessing import image 75 | #from keras.applications.vgg16 import decode_predictions, preprocess_input 76 | #from keras.models import Model 77 | #from tensorflow.compiler import xla 78 | import numpy as np 79 | import time 80 | import os 81 | import sys 82 | import PIL 83 | import json 84 | import math 85 | import multiprocessing 86 | from glob import glob 87 | from PIL import Image 88 | from io import BytesIO 89 | 90 | model = VGG16(weights='imagenet', include_top=True) 91 | feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output) 92 | 93 | def prepImage(img): 94 | x = np.array(img.resize((224, 224)).convert('RGB')) 95 | x = np.expand_dims(x, axis=0) 96 | x = preprocess_input(x) 97 | return x 98 | 99 | logfd = open('log.txt', 'a') 100 | 101 | print("Azure Blob Storage v" + __version__ + " - Python quickstart sample") 102 | 103 | # Create the BlobServiceClient object which will be used to create a container client 104 | blob_service_client = BlobServiceClient.from_connection_string(connectionString) 105 | container_client = blob_service_client.get_container_client("opengameart") 106 | 107 | print("\nProcessing blobs...") 108 | 109 | # List the blobs in the container 110 | for i, fname in enumerate(files): 111 | try: 112 | # Create a blob client using the local file name as the name for the blob 113 | print(i, "Reading blob", fname) 114 | starts = [] 115 | starts.append(time.time()) 116 | blob_client = blob_service_client.get_blob_client(container=container_name, blob=fname) 117 | imgdata = blob_client.download_blob().readall() 118 | 119 | print(i, "Preparing image", fname) 120 | starts.append(time.time()) 121 | file_imgdata = BytesIO(imgdata) 122 | dt = Image.open(file_imgdata) 123 | pimg = prepImage(dt) 124 | 125 | print(i, "Computing feature vector", fname) 126 | starts.append(time.time()) 127 | features = feat_extractor.predict(pimg) 128 | print(i, "Features", features) 129 | 130 | print(i, "Uploading feature vector", fname + ".np") 131 | starts.append(time.time()) 132 | blob_writer = blob_service_client.get_blob_client(container=container_name, blob=fname + ".np") 133 | blob_writer.upload_blob(features.flatten().tobytes(), overwrite=True) 134 | 135 | end = time.time() 136 | print(i, ["read", "prep", "feature", "upload"], [end - start for start in starts]) 137 | 138 | print(i, "Done with", fname) 139 | 140 | print('') 141 | 142 | logfd.write(fname + '\n') 143 | 144 | except Exception as ex: 145 | print('Exception:') 146 | print(ex) 147 | time.sleep(10) 148 | 149 | logfd.close() 150 | 151 | def 
step3(): 152 | 153 | from tensorflow.keras.applications.vgg16 import VGG16 154 | from tensorflow.keras.preprocessing import image 155 | from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input 156 | from tensorflow.keras.models import Model 157 | from tensorflow.compiler import xla 158 | #from keras.applications.vgg16 import VGG16 159 | #from keras.preprocessing import image 160 | #from keras.applications.vgg16 import decode_predictions, preprocess_input 161 | #from keras.models import Model 162 | #from tensorflow.compiler import xla 163 | import numpy as np 164 | import time 165 | import os 166 | import sys 167 | import PIL 168 | import json 169 | import math 170 | import multiprocessing 171 | from glob import glob 172 | from PIL import Image 173 | from io import BytesIO 174 | 175 | model = VGG16(weights='imagenet', include_top=True) 176 | feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output) 177 | 178 | def prepImage(img): 179 | x = np.array(img.resize((224, 224)).convert('RGB')) 180 | x = np.expand_dims(x, axis=0) 181 | x = preprocess_input(x) 182 | return x 183 | 184 | dt = Image.open(sys.argv[1]) 185 | pimg = prepImage(dt) 186 | features = feat_extractor.predict(pimg) 187 | print("Features", features) 188 | 189 | if __name__ == '__main__': 190 | step3() 191 | -------------------------------------------------------------------------------- /scripts/mergeCopyFromAzure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8 4 | # Original author: "Eivind Magnus Hvidevold" 5 | # License: GNU GPLv3 at http://www.gnu.org/licenses/gpl.html 6 | 7 | ''' 8 | ''' 9 | 10 | import sys 11 | import re 12 | import json 13 | import os, uuid 14 | import urllib.parse 15 | import shutil 16 | import time 17 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, __version__ 18 | from collections import defaultdict 19 | from pprint import pprint 20 | # from azure.storage.queue import ( 21 | # QueueClient, 22 | # BinaryBase64EncodePolicy, 23 | # BinaryBase64DecodePolicy 24 | # ) 25 | 26 | def main(): 27 | 'entry point' 28 | 29 | 30 | def step3(): 31 | with open('../local.settings.json') as fd: 32 | settings = json.load(fd) 33 | connectionString = settings["Values"]["AzureWebJobsStorage"] 34 | #os.environ["AZURE_STORAGE_CONNECTION_STRING"] = connectionString 35 | 36 | container_name = "opengameart" 37 | 38 | with open('allblobs2.txt') as fd: 39 | files = [] 40 | sizes = [] 41 | for line in fd.readlines(): 42 | fname, size = line.split('\t') 43 | files.append(fname.strip()) 44 | sizes.append(int(size)) 45 | #files = [fname for fname in files if fname.lower().endswith('.jpg') or fname.lower().endswith('.png')] 46 | 47 | # Retrieve the connection string from an environment 48 | # variable named AZURE_STORAGE_CONNECTION_STRING 49 | connect_str = os.getenv("AZURE_STORAGE_CONNECTION_STRING") 50 | 51 | # Create a unique name for the queue 52 | #q_name = "queue-" + str(uuid.uuid4()) 53 | #q_name = 'sqlqueue' 54 | 55 | # Instantiate a QueueClient object which will 56 | # be used to create and manipulate the queue 57 | #queue_client = QueueClient.from_connection_string(connectionString, q_name) 58 | 59 | # Setup Base64 encoding and decoding functions 60 | # base64_queue_client = QueueClient.from_connection_string( 61 | # conn_str=connectionString, queue_name=q_name, 62 | # message_encode_policy = 
BinaryBase64EncodePolicy(), 63 | # message_decode_policy = BinaryBase64DecodePolicy() 64 | # ) 65 | 66 | class Vars(): pass 67 | lvars = Vars() 68 | lvars.counts = defaultdict(lambda: 0) 69 | 70 | unpacked = [] 71 | byfilename = {} 72 | byfilenameAndSize = defaultdict(lambda: []) 73 | with open('unpacked.txt') as fd: 74 | for line in fd.readlines(): 75 | path = line.strip() 76 | unpacked.append(path) 77 | key = os.path.split(path)[-1] 78 | byfilename[key] = path 79 | try: 80 | key2 = (key, os.path.getsize(path)) 81 | except: 82 | lvars.counts['failsize'] += 1 83 | pass 84 | byfilenameAndSize[key2].append(path.lower()) 85 | 86 | lvars.existCount = 0 87 | lvars.existSize = 0 88 | lvars.notExistCount = 0 89 | lvars.notExistSize = 0 90 | lvars.prc = 0 91 | lvars.unpacked = set(unpacked) 92 | 93 | def updateExists(localpath, lvars): 94 | lvars.existCount += 1 95 | lvars.existSize += size 96 | lvars.counts['exist' + filetype] += 1 97 | if filetype == 'extracted': 98 | try: 99 | lvars.unpacked.remove(localpath) 100 | except: 101 | lvars.counts['unpackedRemoveFail'] += 1 102 | pass 103 | def linkfile(src, dest2): 104 | dest = '/mnt/data2/opengameart2/' + dest2 105 | dirname = os.path.split(dest)[0] 106 | if not os.path.exists(dirname): 107 | os.makedirs(dirname) 108 | if not os.path.exists(dest): 109 | shutil.copyfile(src, dest) 110 | 111 | ofd = open('copyFromAzure.txt', 'w') 112 | for i, (path, size) in enumerate(zip(files, sizes)): 113 | if i % 1000 == 0: 114 | print(i) 115 | if size == 0: 116 | continue 117 | if path.startswith('extract/files'): 118 | localpath = path[len('extract/files/'):] 119 | prefix = '/mnt/data/opengameart/unpacked/' 120 | unq = urllib.parse.unquote_plus 121 | localpath = unq(localpath) 122 | #print("UNQUOTED", localpath) 123 | #localpath = os.path.split(localpath) 124 | localpath = localpath.split('/') 125 | parts = [urllib.parse.quote(localpath[0])] + [unq(x) for x in localpath[1:]] 126 | parts2 = [urllib.parse.quote(localpath[0])] + [os.path.splitext(urllib.parse.quote(localpath[0]))[0]] + [unq(x) for x in localpath[1:]] 127 | #print("PARTS", parts) 128 | localpath = prefix + os.path.join(*parts) 129 | localpath2 = prefix + os.path.join(*parts2) 130 | filetype = 'extracted' 131 | elif path.startswith('files'): 132 | localpath = path[len('files'):] 133 | localpath2 = None 134 | prefix = '/mnt/data/opengameart/files' 135 | localpath = prefix + localpath 136 | filetype = 'file' 137 | else: 138 | localpath = path 139 | localpath2 = None 140 | prefix = '' 141 | filetype = 'other' 142 | 143 | #if i >= 1000: 144 | # break 145 | key = (os.path.split(localpath)[-1], size) 146 | key2 = (os.path.split(localpath2)[-1], size) if localpath2 != None else None 147 | if os.path.exists(localpath) and os.path.getsize(localpath) == size: 148 | updateExists(localpath, lvars) 149 | linkfile(localpath, path) 150 | elif localpath2 != None and os.path.exists(localpath2) and os.path.getsize(localpath2) == size: 151 | updateExists(localpath2, lvars) 152 | linkfile(localpath2, path) 153 | # elif key in byfilenameAndSize and localpath.lower() in byfilenameAndSize[key]: 154 | # updateExists(localpath, lvars) 155 | # localpath3 = [x for x in byfilenameAndSize[key] if x == localpath.lower()][0] 156 | # linkfile(localpath3, path) 157 | # elif localpath2 != None and key2 in byfilenameAndSize and localpath2.lower() in byfilenameAndSize[key2]: 158 | # updateExists(localpath2, lvars) 159 | # localpath4 = [x for x in byfilenameAndSize[key2] if x == localpath2.lower()][0] 160 | # linkfile(localpath4, path) 
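# No local copy matched by path or by (filename, size), so record this blob in
# copyFromAzure.txt; those paths can then be fetched from Azure directly.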
161 | else: 162 | ofd.write(path + '\n') 163 | # if filetype == 'extracted': 164 | # if key in byfilenameAndSize: 165 | # byfname = byfilenameAndSize[key] 166 | # print("LOCALPATH_EXPECTED:", localpath) 167 | # print("LOCALPATH_FOUND___:", byfname) 168 | # print("PARTS_____________:", '¤'.join(parts)) 169 | if localpath.endswith('.png') and lvars.prc < 10: 170 | print(localpath) 171 | lvars.prc += 1 172 | #if prc == 10: 173 | # break 174 | lvars.counts['notexist' + filetype] += 1 175 | lvars.notExistCount += 1 176 | lvars.notExistSize += size 177 | #base64_queue_client.send_message(message.encode('ascii')) 178 | print('exists', lvars.existCount, lvars.existSize / 1024 / 1024 / 1024, 'GiB') 179 | print('notExists', lvars.notExistCount, lvars.notExistSize / 1024 / 1024 / 1024, 'GiB') 180 | print("NOMATCH:") 181 | #for fname in unpacked: 182 | # print(fname) 183 | pprint(lvars.counts) 184 | ofd.close() 185 | 186 | def step4(): 187 | with open('../local.settings.json') as fd: 188 | settings = json.load(fd) 189 | connectionString = settings["Values"]["AzureWebJobsStorage"] 190 | 191 | # Create the BlobServiceClient object which will be used to create a container client 192 | blob_service_client = BlobServiceClient.from_connection_string(connectionString) 193 | container_client = blob_service_client.get_container_client("opengameart") 194 | 195 | with open('allblobs.txt') as fd: 196 | paths = [x.strip() for x in fd.readlines()] 197 | 198 | container_name = 'opengameart' 199 | for i, path in enumerate(paths): 200 | if i % 1000 == 0: 201 | print(i, i * 100 // len(paths)) 202 | dest = '/mnt/data2/opengameart2/' + path 203 | if '__MACOSX' in path: 204 | continue 205 | if not os.path.exists(dest): 206 | print(path) 207 | try: 208 | blob_client = blob_service_client.get_blob_client(container=container_name, blob=path) 209 | filedata = blob_client.download_blob().readall() 210 | 211 | dirname = os.path.split(dest)[0] 212 | if not os.path.exists(dirname): 213 | os.makedirs(dirname) 214 | if not os.path.exists(dest): 215 | with open(dest, 'wb') as dofd: 216 | dofd.write(filedata) 217 | except: 218 | e = sys.exc_info()[0] 219 | print('Error on:', path, e, file=sys.stderr) 220 | time.sleep(1) 221 | 222 | if __name__ == '__main__': 223 | step4() 224 | -------------------------------------------------------------------------------- /scripts/pineQuery.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8 4 | # Original author: "Eivind Magnus Hvidevold" 5 | # License: GNU GPLv3 at http://www.gnu.org/licenses/gpl.html 6 | 7 | ''' 8 | ''' 9 | 10 | import os 11 | import sys 12 | import re 13 | import json 14 | import numpy as np 15 | import pinecone 16 | import pinecone.graph 17 | import pinecone.service 18 | import pinecone.connector 19 | import time 20 | import multiprocessing 21 | 22 | import urllib 23 | from urllib.parse import urlparse 24 | from urllib.parse import quote, unquote 25 | from tensorflow.keras.applications.vgg16 import VGG16 26 | from tensorflow.keras.preprocessing import image 27 | from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input 28 | from tensorflow.keras.models import Model 29 | from tensorflow.compiler import xla 30 | import numpy as np 31 | import PIL 32 | import math 33 | import multiprocessing 34 | from glob import glob 35 | from PIL import Image 36 | from io import BytesIO 37 | from http.server import 
37 | from http.server import BaseHTTPRequestHandler, HTTPServer
38 | from socketserver import ThreadingMixIn
39 | from socket import timeout
40 | import logging
41 | #import httplib2
42 | 
43 | def downloadFile(url):
44 |     try:
45 |         assert url.lower().startswith('http')
46 |         response = urllib.request.urlopen(url, timeout=10).read()
47 |     except (HTTPError, URLError) as error:
48 |         logging.error('Data of %s not retrieved because %s', url, error)
49 |     except timeout:
50 |         logging.error('socket timed out - URL %s', url)
51 |     else:
52 |         logging.info('Access successful.')
53 |         return response
54 | 
55 | def main():
56 |     'entry point'
57 |     with open('../local.settings.json') as fd:
58 |         settings = json.load(fd)
59 |     apiKey = settings['Values']['PINECONE']
60 | 
61 |     db = {}  # integer primary key -> image path (dumped from the SQL database)
62 |     with open('database.txt', 'r') as fd:
63 |         lines = fd.readlines()
64 |         for line in lines:
65 |             stripped = line.strip()
66 |             vid, path = stripped.split('\t')
67 |             vid = int(vid)
68 |             db[vid] = path
69 | 
70 |     reversedb = {}  # file URL -> OpenGameArt content page
71 |     with open('reversedb.txt', 'r') as fd:
72 |         lines = fd.readlines()
73 |         for line in lines:
74 |             stripped = line.strip()
75 |             content, file = stripped.split(':', 1)
76 |             file = file.replace('"https://opengameart.org/sites/default/', '')
77 |             content = content.replace('/mnt/data2/opengameart2/', 'https://opengameart.org/')
78 |             content = content.replace('.html', '/')
79 |             reversedb[file.lower()] = content
80 | 
81 |     #pinecone.init(api_key=apiKey)
82 |     #conn = pinecone.connector.connect("opengameart-search2")
83 | 
84 |     model = VGG16(weights='imagenet', include_top=True)
85 |     feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output)
86 | 
87 |     def prepImage(img):
88 |         x = np.array(img.resize((224, 224)).convert('RGB'))
89 |         x = np.expand_dims(x, axis=0)
90 |         x = preprocess_input(x)
91 |         return x
92 | 
93 |     class S(BaseHTTPRequestHandler):
94 |         def _set_response(self):
95 |             self.send_response(200)
96 |             self.send_header('Content-type', 'text/html')
97 |             self.end_headers()
98 | 
99 |         def do_GET(self):
100 |             logging.info("GET request,\nPath: %s\nHeaders:\n%s\n", str(self.path), str(self.headers))
101 |             self.send_response(200)
102 |             self.send_header('Content-type', 'text/html')
103 |             self.end_headers()
104 |             self.wfile.write("GET request for {}".format(self.path).encode('utf-8'))
105 | 
106 |         def do_POST(self):
107 |             content_length = int(self.headers['Content-Length'])  # size of the request body
108 |             post_data = self.rfile.read(content_length)  # the request body itself
109 |             logging.info("POST request,\nPath: %s\nHeaders:\n%s\n\nBody:\n%s\n", str(self.path), str(self.headers), post_data.decode('utf-8'))
110 | 
111 |             #self._set_response()
112 |             #self.wfile.write("POST request for {}".format(self.path).encode('utf-8'))
113 |             ajax = False  # default in case the content type matches neither branch below
114 | 
115 |             #self._set_response()
116 |             imsi = ''
117 |             data = post_data.decode('utf-8').strip()
118 |             print("Content-type", self.headers['Content-type'])
119 |             count = 10
120 |             if self.headers['Content-type'].lower() == 'application/x-www-form-urlencoded' and 'imsi=' in data:
121 |                 query = data  #urlparse(self.path).query
122 |                 print('QUERY', query)
123 |                 query_components = dict(qc.split("=", 1) for qc in query.split("&"))
124 |                 imsi = query_components["imsi"]
125 |                 imsi = unquote(imsi)
126 |                 imsi = imsi.strip()
127 |                 ajax = False
128 |             elif self.headers['Content-type'].lower() == 'application/json':
129 |                 body = json.loads(data)
130 |                 if 'imsi' in body:
131 |                     imsi = body['imsi']
132 |                 if 'count' in body:
133 |                     try:
134 |                         count = int(body['count'])
135 |                     except ValueError:
136 |                         pass
137 |                 ajax = True
138 |             if count < 0:
139 |                 count = 10
140 |             elif count > 100:
141 |                 count = 100
142 | 
143 |             print('IMSI', imsi)
144 |             # if not imsi and (path.lower().endswith('.png') or path.lower().endswith('.jpg')):
145 |             #     self.send_response(200)
146 |             #     self.send_header('Content-type', 'image/' + path.lower()[-3:])
147 |             #     self.end_headers()
148 |             #     dest = '/mnt/data2/opengameart2/' + path
149 |             #     print('Writing image', dest)
150 |             #     with open(dest, 'rb') as infd:
151 |             #         self.wfile.write(infd.read())
152 | 
153 |             sys.stdout.flush()
154 |             self.send_response(200)
155 |             self.send_header('Content-type', 'text/html')
156 |             self.end_headers()
157 |             #if imsi != '' and (imsi.lower().endswith('.png') or imsi.lower().endswith('.jpg') or imsi.lower().endswith('.jpeg')):
158 |             #self.wfile.write('Search Results'.encode('utf-8'))
159 | 
160 |             def contentLink(url):
161 |                 url = re.sub(r'\.zip.*', '.zip', url)
162 |                 url = re.sub(r'\.rar.*', '.rar', url)
163 |                 url = unquote(url).lower()
164 |                 url = url.replace('https://emh.lart.no/opengameart2/extract/', '')
165 |                 if url in reversedb:
166 |                     return reversedb[url]
167 |                 print("NO MATCH:", url)
168 |                 return 'https://opengameart.org/'
169 | 
170 |             if imsi != '' and imsi.lower().startswith('http') and ('.png' in imsi.lower() or '.jpg' in imsi.lower() or '.jpeg' in imsi.lower() or '.gif' in imsi.lower()):
171 |                 noresults = True
172 |                 try:
173 |                     imgdata = downloadFile(imsi)
174 |                     file_imgdata = BytesIO(imgdata)
175 |                     dt = Image.open(file_imgdata)
176 |                     pimg = prepImage(dt)
177 |                     features = feat_extractor.predict(pimg)
178 |                     print("Features", features)
179 |                     print(features.shape, features.dtype)
180 | 
181 |                     queries = features
182 | 
183 |                     pinecone.init(api_key=apiKey)
184 |                     conn = pinecone.connector.connect("opengameart-search2")
185 |                     results = conn.query(queries=queries, top_k=4*count).collect()
186 |                     print(results)
187 |                     seenurls = {}
188 |                     seenscores = {}
189 |                     currentCount = 0
190 |                     tail = ''
191 |                     for result, score in zip(results[0].ids, results[0].scores):
192 |                         if currentCount >= count:
193 |                             break
194 |                         if score in seenscores:  # skip near-duplicate results with an identical score
195 |                             continue
196 |                         seenscores[score] = True
197 |                         vid = int(result)
198 |                         if not ajax:
199 |                             link = ''
200 |                         else:
201 |                             link = ''
202 |                         head = '' + link + ''
203 |                         self.wfile.write(head.encode('utf-8'))
204 |                         if vid in db:
205 |                             noresults = False
206 |                             url = 'https://emh.lart.no/opengameart2/' + quote(db[vid][0:-len('.np')])
207 |                             if url in seenurls:
208 |                                 continue
209 |                             seenurls[url] = True
210 |                             img = '<img src="' + url + '"/>\n'
211 |                             cl = contentLink(url)
212 |                             a = '<a href="' + cl + '">' + img + '</a>'
213 |                             data = a.encode('utf-8')
214 |                             self.wfile.write(data)
215 |                             currentCount += 1
216 |                             print(result, db[vid])
217 |                     tail = ''
218 |                     self.wfile.write(tail.encode('utf-8'))
219 |                 except Exception:
220 |                     self.wfile.write('Error: Something went wrong.'.encode('utf-8'))
221 |                     raise
222 |                 if noresults:
223 |                     self.wfile.write('No results'.encode('utf-8'))
224 |             elif imsi != '' and 'http' not in imsi:
225 |                 words = imsi.split(' ')
226 |                 words = [x.lower() for x in words]
227 |                 noresults = True
228 |                 results = []
229 |                 for path in db.values():
230 |                     ok = True
231 |                     for word in words:
232 |                         if word not in path.lower():
233 |                             ok = False
234 |                     if ok:
235 |                         results.append(path)
236 |                     if len(results) >= count:
237 |                         break
238 |                 for result in results:
239 |                     noresults = False
240 |                     url = 'https://emh.lart.no/opengameart2/' + quote(result[0:-len('.np')])
241 |                     img = '<img src="' + url + '"/>\n'
242 |                     cl = contentLink(url)
243 |                     a = '<a href="' + cl + '">' + img + '</a>'
244 |                     data = a.encode('utf-8')
245 |                     self.wfile.write(data)
246 |                 if noresults:
247 |                     self.wfile.write('No results'.encode('utf-8'))
248 |             else:
249 |                 self.wfile.write("POST request for {}".format(self.path).encode('utf-8'))
250 | 
251 |     class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
252 |         """Handle requests in a separate thread."""
253 | 
254 |     def run(server_class=ThreadedHTTPServer, handler_class=S, port=7899):
255 |         logging.basicConfig(level=logging.INFO)
256 |         server_address = ('', port)
257 |         httpd = server_class(server_address, handler_class)
258 |         logging.info('Starting httpd...\n')
259 |         try:
260 |             httpd.serve_forever()
261 |         except KeyboardInterrupt:
262 |             pass
263 |         httpd.server_close()
264 |         logging.info('Stopping httpd...\n')
265 | 
266 |     return run
267 | 
268 | if __name__ == '__main__':
269 |     from sys import argv
270 | 
271 |     run = main()
272 |     if len(argv) == 2:
273 |         run(port=int(argv[1]))
274 |     else:
275 |         run()
276 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pixel Art Reverse Image Search for OpenGameArt
2 | 
3 | ## What does the final search look like?
4 | The final search with an example can be found [here](https://emh.lart.no/ogasearch/?imsi=monster%20stone%20soup&count=20).
5 | 
6 | It looks like this: ![OpenGameArt Search](https://emh.lart.no/ogasearch/demo.jpg)
7 | 
8 | ## Why did I want a reverse image search for OpenGameArt?
9 | I wanted to build a reverse image search for OpenGameArt because Google Image Search and TinEye don't give good results for it.
10 | I had previously generated a [huge tile map](https://opengameart.org/content/all-of-2d-art-on-opengameart-in-1-sprite-sheet) to give an overview of similar images on OpenGameArt, but it wasn't very resource-friendly in the browser, had to be split into smaller files, and isn't searchable in any way, just scrollable.
11 | So I wanted a way for people to explore what kind of art is available on OpenGameArt, and I landed on using similarity search to browse the image space.
12 | 
13 | ## How did I do the crawling?
14 | The first thing I had to do was retrieve the search results for the queries I was interested in on OpenGameArt, mostly the 2D art.
15 | Then I had to retrieve each HTML page that was in the search results index and parse the HTML for links to files.
16 | OpenGameArt contains a lot of archive files like zip and rar files, so I then had to unpack them to get to the images.
17 | 
18 | For example, here is a snippet showing how to parse a content page and get its file links:
19 | ```csharp
20 | responseBody = await Common.ReadURIOrCache(blob, Common.BaseURI + page, client);
21 | 
22 | var htmlDoc = new HtmlDocument();
23 | htmlDoc.LoadHtml(responseBody);
24 | var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//body");
25 | 
26 | foreach (var nNode in htmlBody.Descendants("a"))
27 | {
28 |     if (nNode.NodeType == HtmlNodeType.Element &&
29 |         nNode.Attributes["href"] != null &&
30 |         nNode.Attributes["href"].Value.Contains("/default/files/"))
31 |     {
32 |         msg.Add(HttpUtility.HtmlDecode(nNode.Attributes["href"].Value.Replace(Common.FileURI, "")));
33 |     }
34 | }
35 | ```
36 | 
37 | ## Which technology did I use for the crawling and how much did it cost?
38 | I used Azure Functions to do the crawling steps, with some back-and-forth manual intervention to correct things as needed.
39 | Each step had its own queue and put the job for the next step on the next queue, as sketched below.
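To make the queue chaining concrete, here is a minimal sketch of the pattern using the `azure-storage-queue` Python package. The queue names, message format, and `crawl_page` helper are illustrative stand-ins, not the actual crawler code (the real crawler used Azure Functions queue triggers, as in the \*.cs files):

```python
import json
from azure.storage.queue import QueueClient

# Hypothetical queue names; each crawl step reads one queue and feeds the next.
conn_str = "<AzureWebJobsStorage connection string>"
page_queue = QueueClient.from_connection_string(conn_str, "page-queue")
file_queue = QueueClient.from_connection_string(conn_str, "file-queue")

def crawl_page(url):
    """Stand-in for the HTML parsing step; returns file links found on the page."""
    return []

# Consume one step's queue and enqueue jobs for the next step.
for msg in page_queue.receive_messages():
    job = json.loads(msg.content)
    for link in crawl_page(job["url"]):
        file_queue.send_message(json.dumps({"url": link}))
    page_queue.delete_message(msg)
```

The point of the design is that each step is independently retryable: if a step fails, its message stays on (or returns to) the queue instead of taking the whole pipeline down.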
40 | In the end the invocations cost around 50 USD on Azure, for somewhere in the range of 10-20 million Function invocations, if I remember correctly.
41 | 
42 | ## Which alternatives did I investigate?
43 | I tried to use the open-source [Milvus](https://milvus.io/) database, but it crashed on my DigitalOcean server because I didn't have enough memory on it.
44 | Then I luckily stumbled upon a link to [Pinecone](https://www.pinecone.io/) in a Hacker News comment section and decided to use that instead, as the trial was free and I didn't have to expand my server memory to run Milvus.
45 | In the end I expanded my server anyway, but I didn't try [Milvus](https://milvus.io/) again (at least not yet).
46 | 
47 | ## What data do you need on each image to create a reverse image search?
48 | I used [VGG16 feature extraction](https://towardsdatascience.com/extract-features-visualize-filters-and-feature-maps-in-vgg16-and-vgg19-cnn-models-d2da6333edd0) in [my script for this](https://github.com/emnh/PixelArtSearch/blob/master/scripts/featureVectors.py).
49 | See the article for more information, but in essence it's 4096 32-bit floating point numbers per image, which describe various features of the image: say, in a very simplified way, how many stripes or squares it has, or how green it is.
50 | But these features are based on neurons in the VGG16 neural network (which is usually used for image classification), so they can be more complicated than what simple feature tags would describe.
51 | The reason we need these vectors is that it's easy to apply Euclidean distance, cosine similarity, or another measure to two vectors to see whether they are similar, and consequently whether the images are similar.
52 | Furthermore, there is search technology for these vectors that enables fast search over a large number of them.
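As a minimal sketch of such a comparison, here are both measures computed with NumPy on two random stand-in vectors with the same 4096-dimensional shape as VGG16's fc2 output (not real features):

```python
import numpy as np

# Two stand-in feature vectors, shaped like VGG16's fc2 layer output.
a = np.random.rand(4096).astype(np.float32)
b = np.random.rand(4096).astype(np.float32)

euclidean = np.linalg.norm(a - b)
cosine = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Smaller Euclidean distance, or cosine closer to 1, means more similar images.
print(euclidean, cosine)
```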
53 | 
54 | Here's a simplified Python script to show how to do the feature extraction:
55 | ```python
56 | #!/usr/bin/env python3
57 | # -*- coding: utf-8 -*-
58 | # vim: ft=python ts=4 sw=4 sts=4 et fenc=utf-8
59 | 
60 | from tensorflow.keras.applications.vgg16 import VGG16
61 | from tensorflow.keras.preprocessing import image
62 | from tensorflow.keras.applications.vgg16 import decode_predictions, preprocess_input
63 | from tensorflow.keras.models import Model
64 | from tensorflow.compiler import xla
65 | import numpy as np
66 | import time
67 | import os
68 | import sys
69 | import PIL
70 | import json
71 | import math
72 | import multiprocessing
73 | from glob import glob
74 | from PIL import Image
75 | from io import BytesIO
76 | 
77 | model = VGG16(weights='imagenet', include_top=True)
78 | feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output)
79 | 
80 | def prepImage(img):
81 |     x = np.array(img.resize((224, 224)).convert('RGB'))
82 |     x = np.expand_dims(x, axis=0)
83 |     x = preprocess_input(x)
84 |     return x
85 | 
86 | def main():
87 |     'entry point'
88 |     fname = 'demo.jpg'
89 |     dt = Image.open(fname)
90 |     pimg = prepImage(dt)
91 | 
92 |     print("Computing feature vector", fname)
93 |     features = feat_extractor.predict(pimg)
94 |     print(features)
95 | 
96 | if __name__ == '__main__':
97 |     main()
98 | ```
99 | 
100 | Here's the output of the script:
101 | ```bash
102 | emh@frostpunk ~/public_html/ogasearch 0% ./test.py (git)-[gh-pages]
103 | 2021-04-07 18:48:03.158023: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
104 | 2021-04-07 18:48:03.158082: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
105 | 2021-04-07 18:48:07.783109: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
106 | 2021-04-07 18:48:07.783485: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
107 | 2021-04-07 18:48:07.783530: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
108 | 2021-04-07 18:48:07.783580: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (frostpunk): /proc/driver/nvidia/version does not exist
109 | 2021-04-07 18:48:07.784058: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
110 | To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
111 | 2021-04-07 18:48:07.784513: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
112 | 2021-04-07 18:48:08.599925: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
113 | 2021-04-07 18:48:09.194634: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
114 | 2021-04-07 18:48:09.385612: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
115 | 2021-04-07 18:48:13.033066: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
116 | Computing feature vector demo.jpg
117 | 2021-04-07 18:48:13.706621: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
118 | 2021-04-07 18:48:13.717564: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz
119 | [[0. 3.1128967 1.5611947 ... 1.2625191 0.7709812 0. ]]
120 | ./test.py 12.20s user 4.66s system 132% cpu 12.731 total
121 | ```
122 | 
123 | ## How did I maintain a link between the image URLs and the vector database feature vectors?
124 | I also wanted to put all the image URLs into a SQL database in the end, with flags for whether I had run the VGG16 feature extraction and whether the vector had been added to the vector database ([Milvus](https://milvus.io/) or [Pinecone](https://www.pinecone.io/)).
125 | It's essential to be able to map back and forth between the integer primary key, which is used in [Pinecone](https://www.pinecone.io/), and the URL and any other metadata that belongs to the image, as [Pinecone](https://www.pinecone.io/) doesn't store any metadata other than the primary key.
126 | In the end I dumped the SQL database to a tab-separated text file and loaded it on query server startup.
127 | 
128 | ## How long did it take?
129 | I think I spent about a week in total running all the code to completion, with each step (crawling, computing feature vectors, and so on) taking on the order of a day or two.
130 | I don't remember how much time it took to insert the vectors into the [Pinecone](https://www.pinecone.io/) database, but I think it was not the most time-consuming step.
131 | 
132 | ## Two ways of searching: Words and images
133 | - There are two ways of searching. The first is that you enter a keyword, which plainly (and a bit slowly, at O(n)) iterates linearly through the URLs looking for a string match.
134 | I stuck with linear search since it's simple to implement, and all the URLs are kept in memory anyway, so it's not that slow.
135 | I dumped all the URLs to a text file and load it into memory on query server startup instead of querying the SQL server each time.
136 | - The other way of searching is that you enter an image URL, which will run feature extraction on your image (on my server) and then query Pinecone for similar vectors, which map to primary keys, which in turn I look up in the list of URLs.
137 | - I also maintain a "reverse database" text file in order to link back to the OpenGameArt site for the images found (there are some bugs with this I haven't fixed yet, in which case it just links to the OpenGameArt main page). This file is also loaded on query server startup.
138 | Finally, there is also a link under each image to search for similar images, which implicitly uses the second kind of query, by image.
139 | 
140 | ## What are some problems I encountered?
141 | At the end I also added a quick fix to remove near-duplicate image results, which had an identical score.
142 | I ran into some trouble on the search page with "double" URL encoding, because I had stored the files URL-encoded in the file system, but I worked around it with some detection code on the frontend for when the browser double-encoded the URL-encoded file names.
143 | I recommend storing the crawled files without URL encoding. The sketch below illustrates the pitfall.
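Here is a quick illustration of the double-encoding pitfall using Python's `urllib.parse` (the file name is hypothetical, not one from the crawl):

```python
from urllib.parse import quote, unquote

name = "fire ball.png"      # hypothetical crawled file name
stored = quote(name)        # stored URL-encoded on disk: 'fire%20ball.png'
in_browser = quote(stored)  # the browser encodes it again: 'fire%2520ball.png'

# One unquote() only recovers the stored (still encoded) name;
# it takes a second unquote() to get back the original.
assert unquote(in_browser) == stored
assert unquote(unquote(in_browser)) == name
```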
144 | I regret that my scripts are not particularly high quality or polished; for example, each script contains multiple steps, and I change things by editing the script instead of passing command-line arguments.
145 | I don't feel like posting snippets from those scripts and explaining them, as they are a bit messy.
146 | Additionally, I moved the files from Azure Storage to my DigitalOcean server midway through, before running the feature extraction, so there's some inconsistent data location handling.
147 | 
148 | ## What are the final takeaways?
149 | - I recommend doing the crawl on a cheaper substrate than Azure Functions and Azure Storage to save some money, for example your own server or a fixed-price cloud server.
150 | It only cost 50 USD, but I could have done it for free on my DigitalOcean server.
151 | - I recommend building a more robust crawler: idempotent and restartable at any point where it may terminate or require manual intervention (for example, I exceeded the Azure Functions maximum run time of 5 minutes when extracting some of the large zip files, so I extracted those by running the Functions locally in VS Code).
152 | - One thing I regret not getting done this time was extracting all the tiles from tile sheets into individual images for searching. That would have made the search even more useful, though on the other hand it could have cluttered the similarity search with too many nearly identical images.
153 | 
154 | ## Conclusion and final remarks
155 | - It might also be useful to prototype the system on a small subset of the content and run the whole pipeline end to end once it works, instead of completing the crawl first, then doing all the feature extraction, and then doing all the database insertion, as I did.
156 | - In conclusion, what I made was a bit of a hack, with scripting that isn't robust enough to update for new content, but it worked fine as a prototype and gave decent image search results (not always spot on, but I blame that on the feature extraction not really being targeted at tiny pixel art, even though the images are resized/upscaled before feature extraction).
157 | - It could be interesting to do a side-by-side comparison of Milvus and Pinecone on speed and result quality, but I found it much easier to use [Pinecone](https://www.pinecone.io/) since it's already up and running as a service, so I didn't have to run my own vector database.
158 | 
159 | ## Script Locations
160 | 
161 | - The Azure Functions OpenGameArt crawler is in the \*.cs files. I moved the crawled files to DigitalOcean later on because of pricing.
162 | - Scripts for [machine learning](https://github.com/emnh/PixelArtSearch/blob/master/scripts/featureVectors.py) and the [Pinecone query server](https://github.com/emnh/PixelArtSearch/blob/master/scripts/pineQuery.py) are in /scripts.
163 | - [Front page source](https://github.com/emnh/PixelArtSearch/blob/master/index.html).
164 | - See also the [source code for the t-SNE embedding of OpenGameArt images](https://github.com/emnh/opengameart/).
165 | 
--------------------------------------------------------------------------------