├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .gitattributes ├── .github └── workflows │ ├── build.yml │ └── deploy-function.yml ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── Analysis ├── Accuracy.twb ├── ChannelAnalysis.twb ├── ChannelVideoViewsDiscrepency.twb ├── ClassificationReliability.r ├── DocumentClassificationTidy.R ├── Election Fraud.twb ├── ElectionFraudRemoved.twb ├── Entities.twb ├── Images │ ├── FoxRecommendations.png │ ├── GroupFlows.png │ ├── GroupImpressions.png │ ├── GroupRecommendationPercent.png │ ├── Groups.png │ ├── LcrImpressions.png │ ├── MediaImpressions.png │ ├── NetGroupRecommendations.png │ ├── NetLcrRecommendations.png │ ├── NetMediaRecommendations.png │ ├── NetRubeiroRecommendations.png │ ├── ReviewerAgreementICC.png │ ├── RubeiroImpressions.png │ └── recfluence_howto.gif ├── MonthlyViewComparison.twb ├── Pendulum.twb ├── Public.twb ├── QTerms.twb ├── RecAccuracy.R ├── RecExport.twb ├── Recfluence New vs Old.twb ├── SamClark.twb ├── SeussRecs.twb ├── Tag Stats.tds ├── TagRecs.twb ├── UserScrape.twb ├── Video Extra.twb ├── WatchTime.twb ├── Working.twb ├── Working2.twb └── danami covid prediction.twb ├── App ├── .config │ └── dotnet-tools.json ├── .devcontainer │ ├── Dockerfile │ ├── devcontainer.json │ └── library-scripts │ │ └── azcli-debian.sh ├── .dockerignore ├── .editorconfig ├── .run │ ├── Branch Create.run.xml │ ├── Build Container.run.xml │ ├── Clean.run.xml │ ├── Collect List.run.xml │ ├── Prod - Collect List.run.xml │ ├── Prod.run.xml │ ├── Schema.run.xml │ └── Update.run.xml ├── .vscode │ ├── extensions.json │ ├── launch.json │ ├── settings.json │ └── tasks.json ├── Dockerfile ├── GitVersion.yml ├── Mutuo.Etl │ ├── AzureManagement │ │ ├── AzureCleaner.cs │ │ └── AzureEx.cs │ ├── Blob │ │ ├── AzureBlobExtensions.cs │ │ ├── AzureBlobFileStore.cs │ │ ├── BlobIndex.cs │ │ ├── JsonlSink.cs │ │ ├── JsonlStore.cs │ │ ├── JsonlStoreExtensions.cs │ │ ├── 
KeyedCollectionStore.cs │ │ ├── LocalSimpleFileStore.cs │ │ ├── S3Store.cs │ │ └── SimpleFileStore.cs │ ├── Db │ │ ├── DbExtensions.cs │ │ └── LoggedConnection.cs │ ├── DockerRegistry │ │ └── RegistryClient.cs │ ├── GlobalUsings.cs │ ├── Mutuo.Etl.csproj │ └── Pipe │ │ ├── AzureContainers.cs │ │ ├── ContainerLauncher.cs │ │ ├── DependencyGraph.cs │ │ ├── LocalPipeWorker.cs │ │ ├── PipeCfg.cs │ │ ├── PipeCtx.cs │ │ ├── PipeWorker.cs │ │ ├── Pipes.cs │ │ └── TaskGraph.cs ├── Mutuo.Tools │ ├── Mutuo.Tools.csproj │ ├── Program.cs │ └── SchemaTool.cs ├── Recfluence.sln ├── Recfluence.sln.DotSettings ├── SysExtensions │ ├── AutofacExtensions.cs │ ├── Build │ │ └── GitVersionInfo.cs │ ├── Collections │ │ ├── AsyncEnumerableExtensions.cs │ │ ├── CollectionExtensions.cs │ │ ├── DictionaryExtensions.cs │ │ ├── EnumerableExtensions.cs │ │ ├── KeyedCollection.cs │ │ ├── MultiValueDictionary.cs │ │ └── QueueExtensions.cs │ ├── Configuration │ │ └── DataAnnotationValidator.cs │ ├── DateTimeExtensions.cs │ ├── Either.cs │ ├── EnumExtensions.cs │ ├── ExceptionExtensions.cs │ ├── GlobalUsings.cs │ ├── IO │ │ ├── CsvExtensions.cs │ │ ├── FPath.cs │ │ ├── IOExtensions.cs │ │ └── PathExtensions.cs │ ├── LogExtensions.cs │ ├── Net │ │ ├── FunctionExtensions.cs │ │ ├── HttpClientExtensions.cs │ │ ├── HttpExtensions.cs │ │ └── Policies.cs │ ├── NumberExtensions.cs │ ├── Reflection │ │ ├── ExpressionExtensions.cs │ │ └── ReflectionExtensions.cs │ ├── Security │ │ └── NameSecret.cs │ ├── Serialization │ │ ├── CoreSerializeContractResolver.cs │ │ ├── JsonExtensions.cs │ │ ├── JsonStringConverter.cs │ │ ├── JsonlExtensions.cs │ │ ├── StringEnumConverterExtended.cs │ │ └── XmlExtensions.cs │ ├── ShortGuid.cs │ ├── SysExtensions.csproj │ ├── Text │ │ ├── HumanizeExtensions.cs │ │ ├── MetricExtensions.cs │ │ ├── SPath.cs │ │ └── StringExtensions.cs │ ├── Threading │ │ ├── BlockExtensions.cs │ │ ├── Defer.cs │ │ └── TaskExtensions.cs │ └── ValueExtensions.cs ├── Tests │ ├── BlockTest.cs │ ├── 
DbTests.cs │ ├── FormattingTests.cs │ ├── GlobalUsings.cs │ ├── PipeTests.cs │ ├── ScrapingTests.cs │ ├── SerializationTests.cs │ ├── TestSetup.cs │ ├── Tests.csproj │ └── WatchPageHtml │ │ └── watch_v=gRJnTYHID3w&bpctr=9999999999&hl=en-us.html ├── YtCli │ ├── Commands.cs │ ├── PipeCmd.cs │ ├── Program.cs │ ├── YtCli.csproj │ ├── default.appcfg.json │ ├── dev.appcfg.json │ └── prod.appcfg.json ├── YtFunctions │ ├── .gitignore │ ├── .vscode │ │ ├── extensions.json │ │ ├── launch.json │ │ ├── settings.json │ │ └── tasks.json │ ├── ApiBackend.cs │ ├── ApiRecfluence.cs │ ├── Create.azcli │ ├── HttpResponseEx.cs │ ├── Program.cs │ ├── YtFunctions.csproj │ └── host.json ├── YtReader │ ├── AppCfg.schema.json │ ├── BranchEnvCreator.cs │ ├── Cfg.cs │ ├── Collect │ │ ├── CollectDb.cs │ │ └── CollectExtensions.cs │ ├── ContainerCommand.cs │ ├── Data │ │ ├── Stage.cs │ │ └── StageDb.cs │ ├── DataScripts.cs │ ├── Db │ │ ├── DataformDescriptions.cs │ │ ├── Snowflake.cs │ │ ├── WarehouseCreator.cs │ │ └── YtMartModel.cs │ ├── GlobalUsings.cs │ ├── RecExport.cs │ ├── ResourceCycle.cs │ ├── RootCfg.schema.json │ ├── Search │ │ └── YtSearch.cs │ ├── Setup.cs │ ├── Store │ │ ├── BlobStores.cs │ │ ├── StoreUpgrader.cs │ │ ├── YtIndexResults.cs │ │ ├── YtResults.cs │ │ ├── YtResultsSql.cs │ │ └── readme.md │ ├── UserScrape.cs │ ├── VersionInfo.cs │ ├── Web │ │ ├── AngleExtensions.cs │ │ ├── FlurlExtensions.cs │ │ ├── FlurlProxyClient.cs │ │ ├── WebEx.cs │ │ └── YtGtEx.cs │ ├── Yt │ │ ├── CollectList.cs │ │ ├── CollectListSql.cs │ │ ├── YtClient.cs │ │ ├── YtCollectDb.cs │ │ ├── YtCollectEx.cs │ │ ├── YtCollector.cs │ │ ├── YtModel.cs │ │ ├── YtWeb.cs │ │ └── YtWebExtensions.cs │ ├── YtBackup.cs │ ├── YtConvertWatchTimeFiles.cs │ ├── YtDataform.cs │ ├── YtReader.csproj │ └── YtUpdater.cs ├── docker-run-recfluence.ps1 ├── omnisharp.json └── readme.md ├── DataScripts ├── .devcontainer │ ├── Dockerfile │ └── devcontainer.json ├── .gitignore ├── .vscode │ ├── extensions.json │ ├── 
launch.json │ ├── settings.json │ └── tasks.json ├── Dockerfile ├── app.py ├── args.py ├── blobstore.py ├── cfg.py ├── jsonl.py ├── log.py ├── readme.md ├── requirements.txt ├── sf.py ├── video_entities.py └── video_entities_test.py ├── Dataform ├── .devcontainer │ ├── Dockerfile │ └── devcontainer.json ├── .dockerignore ├── .gitignore ├── .vscode │ ├── launch.json │ ├── settings.json │ └── tasks.json ├── Dockerfile ├── package-lock.json ├── package.json ├── src │ ├── Dataform.ts │ ├── NodeTypings.d.ts │ └── run.ts └── tsconfig.json ├── Env ├── .devcontainer │ ├── Dockerfile │ └── devcontainer.json ├── backrup_db2.azcli ├── create_seq.azcli ├── github actions.azcli ├── prod_update.azcli └── recfluence_update.azcli ├── LICENSE ├── Site ├── .babelrc ├── .devcontainer │ ├── Dockerfile │ └── devcontainer.json ├── .env.development ├── .env.production ├── .gitignore ├── .vscode │ ├── launch.json │ ├── settings.json │ └── tasks.json ├── Logo.pptx ├── auth0login.html ├── gatsby-config.js ├── gatsby-node.js ├── jest.config.js ├── package-lock.json ├── package.json ├── src │ ├── common │ │ ├── Chart.ts │ │ ├── DbModel.ts │ │ ├── Dim.ts │ │ ├── Elastic.ts │ │ ├── Uri.spec.ts │ │ ├── Uri.ts │ │ ├── Utils.ts │ │ ├── YtApi.ts │ │ ├── YtInteractiveChartHelper.tsx │ │ └── YtModel.ts │ ├── components │ │ ├── Button.tsx │ │ ├── MainLayout.tsx │ │ ├── OutsideClick.tsx │ │ ├── SearchContext.tsx │ │ ├── SiteMenu.tsx │ │ ├── Spinner.tsx │ │ ├── Tag.tsx │ │ ├── UserContext.tsx │ │ ├── UserMenu.tsx │ │ ├── channel │ │ │ └── Channel.tsx │ │ ├── channel_relations │ │ │ ├── ChannelRelations.tsx │ │ │ ├── ChannelRelationsPage.tsx │ │ │ ├── ChannelRelationsTitle.tsx │ │ │ ├── ChannelTags.tsx │ │ │ ├── RecFlows.tsx │ │ │ └── SearchChannels.tsx │ │ ├── review │ │ │ ├── Review.tsx │ │ │ ├── ReviewCommon.tsx │ │ │ ├── ReviewForm.tsx │ │ │ └── ReviewGrid.tsx │ │ └── search │ │ │ ├── VideoPage.tsx │ │ │ ├── VideoSearch.tsx │ │ │ └── VideoSearchResults.tsx │ ├── images │ │ ├── recfluence_logo.svg │ │ 
├── recfluence_word.svg │ │ └── recfluence_word_dark.svg │ ├── pages │ │ ├── 404.tsx │ │ ├── index.tsx │ │ ├── review.tsx │ │ ├── search.tsx │ │ └── video.tsx │ ├── styles │ │ └── main.css │ └── types │ │ ├── NodeTypings.d.ts │ │ ├── custom.d.ts │ │ ├── d3fc-label-layout.d.ts │ │ ├── lucene-query-parser.d.ts │ │ └── react-select-async.d.ts ├── static │ ├── favicon.ico │ ├── help │ │ ├── categories_flow_help.png │ │ ├── channel_legend.svg │ │ ├── ideology_selection.png │ │ ├── impression_advantage.svg │ │ └── selection_flow_help.png │ ├── spinner.png │ └── spinner.svg └── tsconfig.json ├── UserScrape ├── .devcontainer │ ├── Dockerfile │ └── devcontainer.json ├── .dockerignore ├── .gitignore ├── .vscode │ ├── extensions.json │ ├── launch.json │ ├── settings.json │ └── tasks.json ├── Dockerfile ├── app.py ├── cfg_generalte_schema.py ├── readme.md ├── requirements.txt ├── sandbox.py ├── tox.ini ├── userscrape.schema.json └── userscrape │ ├── __init__.py │ ├── cfg.py │ ├── crawler.py │ ├── data.py │ ├── discord_bot.py │ ├── format.py │ ├── log.py │ ├── results.py │ └── store.py ├── azure-pipelines.yml ├── chromedriver └── readme.md /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.154.0/containers/dotnet/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] .NET version: 5.0, 3.1, 2.1 4 | ARG VARIANT="5.0" 5 | FROM mcr.microsoft.com/vscode/devcontainers/dotnetcore:0-${VARIANT} 6 | 7 | # [Option] Install Node.js 8 | ARG INSTALL_NODE="true" 9 | ARG NODE_VERSION="lts/*" 10 | RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "source /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 11 | 12 | # [Option] Install Azure CLI 13 | ARG INSTALL_AZURE_CLI="false" 14 | COPY library-scripts/azcli-debian.sh /tmp/library-scripts/ 15 | RUN if [ "$INSTALL_AZURE_CLI" = "true" ]; then bash 
/tmp/library-scripts/azcli-debian.sh; fi \ 16 | && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/library-scripts 17 | 18 | # [Optional] Uncomment this section to install additional OS packages. 19 | # RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 20 | # && apt-get -y install --no-install-recommends 21 | 22 | # [Optional] Uncomment this line to install global node packages. 23 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.154.0/containers/dotnet 3 | { 4 | "name": "C# (.NET)", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "args": { 8 | // Update 'VARIANT' to pick a .NET Core version: 2.1, 3.1, 5.0 9 | "VARIANT": "3.1", 10 | // Options 11 | "INSTALL_NODE": "false", 12 | "NODE_VERSION": "lts/*", 13 | "INSTALL_AZURE_CLI": "true" 14 | } 15 | }, 16 | 17 | // Set *default* container specific settings.json values on container create. 18 | "settings": { 19 | "terminal.integrated.shell.linux": "/bin/bash" 20 | }, 21 | 22 | // Add the IDs of extensions you want installed when the container is created. 23 | "extensions": [ 24 | "ms-dotnettools.csharp" 25 | ], 26 | 27 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 28 | // "forwardPorts": [5000, 5001], 29 | 30 | // [Optional] To reuse of your local HTTPS dev cert: 31 | // 32 | // 1. 
Export it locally using this command: 33 | // * Windows PowerShell: 34 | // dotnet dev-certs https --trust; dotnet dev-certs https -ep "$env:USERPROFILE/.aspnet/https/aspnetapp.pfx" -p "SecurePwdGoesHere" 35 | // * macOS/Linux terminal: 36 | // dotnet dev-certs https --trust; dotnet dev-certs https -ep "${HOME}/.aspnet/https/aspnetapp.pfx" -p "SecurePwdGoesHere" 37 | // 38 | // 2. Uncomment these 'remoteEnv' lines: 39 | // "remoteEnv": { 40 | // "ASPNETCORE_Kestrel__Certificates__Default__Password": "SecurePwdGoesHere", 41 | // "ASPNETCORE_Kestrel__Certificates__Default__Path": "/home/vscode/.aspnet/https/aspnetapp.pfx", 42 | // }, 43 | // 44 | // 3. Do one of the following depending on your scenario: 45 | // * When using GitHub Codespaces and/or Remote - Containers: 46 | // 1. Start the container 47 | // 2. Drag ~/.aspnet/https/aspnetapp.pfx into the root of the file explorer 48 | // 3. Open a terminal in VS Code and run "mkdir -p /home/vscode/.aspnet/https && mv aspnetapp.pfx /home/vscode/.aspnet/https" 49 | // 50 | // * If only using Remote - Containers with a local container, uncomment this line instead: 51 | // "mounts": [ "source=${env:HOME}${env:USERPROFILE}/.aspnet/https,target=/home/vscode/.aspnet/https,type=bind" ], 52 | 53 | // Use 'postCreateCommand' to run commands after the container is created. 54 | // "postCreateCommand": "dotnet restore", 55 | 56 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
57 | //"remoteUser": "vscode" 58 | } 59 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - App/** 9 | workflow_dispatch: 10 | 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v2 17 | with: 18 | fetch-depth: 0 19 | - name: Install GitVersion 20 | uses: gittools/actions/gitversion/setup@v0.9.9 21 | with: 22 | versionSpec: '5.x' 23 | - name: Determine Version 24 | id: gitversion 25 | uses: gittools/actions/gitversion/execute@v0.9.9 26 | - name: Login to ACR 27 | uses: docker/login-action@v1 28 | with: 29 | registry: ytnetworks.azurecr.io 30 | username: ${{ secrets.REGISTRY_USER }} 31 | password: ${{ secrets.REGISTRY_PASS }} 32 | - name: Build and push 33 | uses: docker/build-push-action@v2 34 | with: 35 | context: ./App 36 | platforms: linux/amd64 37 | push: true 38 | build-args: | 39 | SEMVER=${{ steps.gitversion.outputs.semVer }} 40 | ASSEMBLY_SEMVER=${{ steps.gitversion.outputs.assemblySemFileVer }} 41 | tags: | 42 | ytnetworks.azurecr.io/recfluence:latest 43 | ytnetworks.azurecr.io/recfluence:${{ steps.gitversion.outputs.semVer }} 44 | -------------------------------------------------------------------------------- /.github/workflows/deploy-function.yml: -------------------------------------------------------------------------------- 1 | name: deploy-function 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - App/YtFunctions/** 9 | workflow_dispatch: 10 | 11 | # CONFIGURATION 12 | # For help, go to https://github.com/Azure/Actions 13 | # 14 | # 1. Set up the following secrets in your repository: 15 | # AZURE_FUNCTIONAPP_PUBLISH_PROFILE 16 | # 17 | # 2. 
Change these variables for your configuration: 18 | env: 19 | AZURE_FUNCTIONAPP_NAME: recfluence # set this to your application's name 20 | AZURE_FUNCTIONAPP_PACKAGE_PATH: './App/YtFunctions' # set this to the path to your web app project, defaults to the repository root 21 | DOTNET_VERSION: '6.0.100' # set this to the dotnet version to use 22 | 23 | jobs: 24 | build-and-deploy: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v2 29 | with: 30 | fetch-depth: 0 31 | - name: Install GitVersion 32 | uses: gittools/actions/gitversion/setup@v0.9.9 33 | with: 34 | versionSpec: '5.x' 35 | - name: Determine Version 36 | id: gitversion 37 | uses: gittools/actions/gitversion/execute@v0.9.9 38 | 39 | - name: Setup DotNet ${{ env.DOTNET_VERSION }} Environment 40 | uses: actions/setup-dotnet@v1 41 | with: 42 | dotnet-version: ${{ env.DOTNET_VERSION }} 43 | - name: 'Build' 44 | shell: bash 45 | run: | 46 | pushd './${{ env.AZURE_FUNCTIONAPP_PACKAGE_PATH }}' 47 | dotnet build --configuration Release --output ./output /p:Version=${{ steps.gitversion.outputs.assemblySemFileVer }} /p:InformationalVersion=${{ steps.gitversion.outputs.semVer }} 48 | popd 49 | - name: 'Build & Deploy Function' 50 | uses: Azure/functions-action@v1 51 | id: fa 52 | with: 53 | app-name: ${{ env.AZURE_FUNCTIONAPP_NAME }} 54 | package: '${{ env.AZURE_FUNCTIONAPP_PACKAGE_PATH }}/output' 55 | publish-profile: ${{ secrets.AZURE_FUNCTIONAPP_PUBLISH_PROFILE }} 56 | 57 | # For more samples to get started with GitHub Action workflows to deploy to Azure, refer to https://github.com/Azure/actions-workflow-samples 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore all directories starting with "." 
with some exceptions 2 | .idea/ 3 | .Data/ 4 | *local.*.json 5 | 6 | 7 | # vs user settings 8 | *.user 9 | 10 | # Build results 11 | [Dd]ebug/ 12 | [Dd]ebugPublic/ 13 | [Rr]elease/ 14 | [Rr]eleases/ 15 | x64/ 16 | x86/ 17 | bld/ 18 | [Bb]in/ 19 | [Oo]bj/ 20 | [Ll]og/ 21 | 22 | # npm 23 | node_modules/ 24 | 25 | # Python Tools for Visual Studio (PTVS) 26 | __pycache__/ 27 | *.pyc 28 | 29 | # Tableau & R 30 | *.twbr 31 | .Rhistory 32 | .RData 33 | UserScrape/.env 34 | UserScrape/geckodriver.log 35 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-vscode.csharp" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to find out which attributes exist for C# debugging 3 | // Use hover for the description of the existing attributes 4 | // For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "PySpark", 9 | "type": "python", 10 | "request": "launch", 11 | "osx": { 12 | "pythonPath": "${env:SPARK_HOME}/bin/spark-submit" 13 | }, 14 | "windows": { 15 | "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd" 16 | }, 17 | "linux": { 18 | "pythonPath": "${env:SPARK_HOME}/bin/spark-submit" 19 | }, 20 | "program": "${file}" 21 | }, 22 | { 23 | "type": "chrome", 24 | "request": "launch", 25 | "name": "Launch Chrome against localhost", 26 | "url": "http://localhost:8000", 27 | "webRoot": "${workspaceFolder}" 28 | }, 29 | { 30 | "name": "Debug recfluence update", 31 | "type": "coreclr", 32 | "request": "launch", 33 | "preLaunchTask": "build YtCli", 34 | // If you have 
changed target frameworks, make sure to update the program path. 35 | "program": "./bin/Debug/netcoreapp3.1/recfluence.dll", 36 | "args": [ 37 | "update", 38 | "-a", 39 | "Stage", 40 | "-f", 41 | "-s", 42 | "video_stage" 43 | ], 44 | "cwd": "${workspaceFolder}/App/YtCli/", "console": "internalConsole", 45 | "stopAtEntry": false, 46 | "internalConsoleOptions": "openOnSessionStart" 47 | }, 48 | { 49 | "name": "Debug YtFunctions", 50 | "type": "coreclr", 51 | "request": "attach", 52 | "processId": "${command:azureFunctions.pickProcess}", 53 | } 54 | ] 55 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.projectRuntime": "~3", 3 | "azureFunctions.projectLanguage": "C#", 4 | "azureFunctions.templateFilter": "Verified", 5 | "azureFunctions.preDeployTask": "publish YtFunctions", 6 | "debug.internalConsoleOptions": "neverOpen", 7 | "dotnet-test-explorer.testProjectPath": "/App/Tests", 8 | "python.dataScience.jupyterServerURI": " http://localhost:8888/?token=e272f55131dfdbf050b519e59b2e77600b76178603e5e338", 9 | "python.pythonPath": "C:\\Users\\mark\\AppData\\Local\\Programs\\Python\\Python38-32\\python.exe" 10 | } 11 | -------------------------------------------------------------------------------- /Analysis/ClassificationReliability.r: -------------------------------------------------------------------------------- 1 | library(lpSolve) 2 | library(irr) 3 | library(dplyr) 4 | library(tidyr) 5 | 6 | options(stringsAsFactors = FALSE) 7 | # Download <name>.csv.gz from the results/v2/latest blob folder, decompress it and parse it as CSV; returns a data.frame. 8 | getResult <- function(name) { 9 | con <- gzcon(url(paste("https://pyt.blob.core.windows.net/data/results/v2/latest/", name, ".csv.gz", sep=""))); on.exit(close(con), add = TRUE) # release the download connection even if reading fails 10 | tc <- textConnection(readLines(con)); on.exit(close(tc), add = TRUE) # in-memory connection over the decompressed lines, closed on exit 11 | return(read.csv(tc, header=TRUE, quote="\"")) 12 | } 13 | 14 | # helpful articles 15 | # simple explanation 
http://neoacademic.com/2011/11/16/computing-intraclass-correlations-icc-as-estimates-of-interrater-reliability-in-spss/ 16 | # specific to irr https://www.r-bloggers.com/k-is-for-cohens-kappa/ 17 | # another irr one; maybe kappa is better http://www.cookbook-r.com/Statistical_analysis/Inter-rater_reliability/ 18 | # Accumulator: one row of reliability statistics per rating set, appended by addResult (first column is `name`, matching the rows addResult builds). 19 | results = data.frame(name=character(), subjects=integer(), raters=integer(), 20 | kap=numeric(), kap_p=numeric(), 21 | icc=numeric(), icc_p=numeric(), icc_lbound=numeric(),icc_ubound=numeric(), 22 | agreement=numeric()) 23 | # Computes percent agreement (agree), Fleiss' kappa (kappam.fleiss) and a two-way agreement ICC (icc) for `ratings`; returns the global `results` with that row appended, so the caller must assign the return value. 24 | addResult <- function(name, ratings) { 25 | agree = agree(ratings) 26 | kap = kappam.fleiss(ratings) 27 | icc = icc(ratings, model="twoway", type="agreement") 28 | df = data.frame(name=name, subjects=kap$subjects, raters=kap$raters, 29 | kap = kap$value, kap_p = kap$p.value, 30 | icc = icc$value, icc_p = icc$p.value, icc_lbound=icc$lbound, icc_ubound=icc$ubound, 31 | agreement = agree$value) 32 | rbind(results, df) 33 | } 34 | 35 | 36 | 37 | # 38 | # lr ICC 39 | # 40 | 41 | lr = getResult("icc_lr") 42 | 43 | reviewers = c("Ac", "os", "zY") # filter out reviewers with only a handful of classifications 44 | lrCodes = c( "L" = -1, "C" = 0, "R" = 1) 45 | # Spread to one REVIEWER_<id> column per kept reviewer, holding that reviewer's L/C/R rating coded as -1/0/1. 46 | lrRatings = lr %>% 47 | filter(REVIEWER %in% reviewers) %>% 48 | mutate(REVIEWER_LR = lrCodes[REVIEWER_LR]) %>% 49 | spread(REVIEWER, REVIEWER_LR, sep="_") %>% 50 | select(starts_with("REVIEWER")) 51 | 52 | 53 | results = addResult("Left/Center/Right", lrRatings) 54 | 55 | 56 | # 57 | # tag ICC 58 | # 59 | tag = getResult("icc_tags") 60 | uniqueTags = unique(tag$TAG) 61 | # Same statistics once per tag, using each kept reviewer's logical "has tag" rating. 62 | for(t in uniqueTags) { 63 | tagRatings = tag %>% 64 | filter(REVIEWER %in% reviewers & TAG == t) %>% 65 | mutate(REVIEWER_HAS_TAG = as.logical(REVIEWER_HAS_TAG)) %>% 66 | spread(REVIEWER, REVIEWER_HAS_TAG, sep="_") %>% 67 | select(starts_with("REVIEWER")) 68 | 69 | results = addResult(t, tagRatings) 70 | } 71 | 72 | print(results) 73 | # write.csv always writes comma-separated output and ignores (with a warning) any attempt to set sep, so it is not passed. 74 | write.csv(results, "reviewer_reliability.csv", row.names = FALSE) 
75 | 76 | -------------------------------------------------------------------------------- /Analysis/Images/FoxRecommendations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/FoxRecommendations.png -------------------------------------------------------------------------------- /Analysis/Images/GroupFlows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/GroupFlows.png -------------------------------------------------------------------------------- /Analysis/Images/GroupImpressions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/GroupImpressions.png -------------------------------------------------------------------------------- /Analysis/Images/GroupRecommendationPercent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/GroupRecommendationPercent.png -------------------------------------------------------------------------------- /Analysis/Images/Groups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/Groups.png -------------------------------------------------------------------------------- /Analysis/Images/LcrImpressions.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/LcrImpressions.png -------------------------------------------------------------------------------- /Analysis/Images/MediaImpressions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/MediaImpressions.png -------------------------------------------------------------------------------- /Analysis/Images/NetGroupRecommendations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/NetGroupRecommendations.png -------------------------------------------------------------------------------- /Analysis/Images/NetLcrRecommendations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/NetLcrRecommendations.png -------------------------------------------------------------------------------- /Analysis/Images/NetMediaRecommendations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/NetMediaRecommendations.png -------------------------------------------------------------------------------- /Analysis/Images/NetRubeiroRecommendations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/NetRubeiroRecommendations.png -------------------------------------------------------------------------------- /Analysis/Images/ReviewerAgreementICC.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/ReviewerAgreementICC.png -------------------------------------------------------------------------------- /Analysis/Images/RubeiroImpressions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/RubeiroImpressions.png -------------------------------------------------------------------------------- /Analysis/Images/recfluence_howto.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Analysis/Images/recfluence_howto.gif -------------------------------------------------------------------------------- /App/.config/dotnet-tools.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "isRoot": true, 4 | "tools": { 5 | "gitversion.tool": { 6 | "version": "5.6.6", 7 | "commands": [ 8 | "dotnet-gitversion" 9 | ] 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /App/.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.166.1/containers/dotnet/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] .NET version: 5.0, 3.1, 2.1 4 | ARG VARIANT="5.0" 5 | FROM mcr.microsoft.com/vscode/devcontainers/dotnetcore:0-${VARIANT} 6 | 7 | # [Option] Install Node.js 8 | ARG INSTALL_NODE="true" 9 | ARG NODE_VERSION="lts/*" 10 | RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . 
/usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 11 | 12 | # [Option] Install Azure CLI 13 | ARG INSTALL_AZURE_CLI="false" 14 | COPY library-scripts/azcli-debian.sh /tmp/library-scripts/ 15 | RUN if [ "$INSTALL_AZURE_CLI" = "true" ]; then bash /tmp/library-scripts/azcli-debian.sh; fi \ 16 | && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/library-scripts 17 | 18 | # [Optional] Uncomment this section to install additional OS packages. 19 | # RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 20 | # && apt-get -y install --no-install-recommends 21 | 22 | # [Optional] Uncomment this line to install global node packages. 23 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 -------------------------------------------------------------------------------- /App/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.166.1/containers/dotnet 3 | { 4 | "name": "C# (.NET)", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "args": { 8 | // Update 'VARIANT' to pick a .NET Core version: 2.1, 3.1, 5.0 9 | "VARIANT": "5.0", 10 | // Options 11 | "INSTALL_NODE": "false", 12 | "NODE_VERSION": "lts/*", 13 | "INSTALL_AZURE_CLI": "true" 14 | } 15 | }, 16 | 17 | // Set *default* container specific settings.json values on container create. 18 | "settings": { 19 | "terminal.integrated.shell.linux": "/bin/bash" 20 | }, 21 | 22 | // Add the IDs of extensions you want installed when the container is created. 23 | "extensions": [ 24 | "ms-dotnettools.csharp" 25 | ], 26 | 27 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 28 | // "forwardPorts": [5000, 5001], 29 | 30 | // [Optional] To reuse of your local HTTPS dev cert: 31 | // 32 | // 1. 
Export it locally using this command: 33 | // * Windows PowerShell: 34 | // dotnet dev-certs https --trust; dotnet dev-certs https -ep "$env:USERPROFILE/.aspnet/https/aspnetapp.pfx" -p "SecurePwdGoesHere" 35 | // * macOS/Linux terminal: 36 | // dotnet dev-certs https --trust; dotnet dev-certs https -ep "${HOME}/.aspnet/https/aspnetapp.pfx" -p "SecurePwdGoesHere" 37 | // 38 | // 2. Uncomment these 'remoteEnv' lines: 39 | // "remoteEnv": { 40 | // "ASPNETCORE_Kestrel__Certificates__Default__Password": "SecurePwdGoesHere", 41 | // "ASPNETCORE_Kestrel__Certificates__Default__Path": "/home/vscode/.aspnet/https/aspnetapp.pfx", 42 | // }, 43 | // 44 | // 3. Do one of the following depending on your scenario: 45 | // * When using GitHub Codespaces and/or Remote - Containers: 46 | // 1. Start the container 47 | // 2. Drag ~/.aspnet/https/aspnetapp.pfx into the root of the file explorer 48 | // 3. Open a terminal in VS Code and run "mkdir -p /home/vscode/.aspnet/https && mv aspnetapp.pfx /home/vscode/.aspnet/https" 49 | // 50 | // * If only using Remote - Containers with a local container, uncomment this line instead: 51 | // "mounts": [ "source=${env:HOME}${env:USERPROFILE}/.aspnet/https,target=/home/vscode/.aspnet/https,type=bind" ], 52 | 53 | // Use 'postCreateCommand' to run commands after the container is created. 54 | "postCreateCommand": "dotnet restore && dotnet tool restore", 55 | 56 | // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 57 | //"remoteUser": "vscode" 58 | } 59 | -------------------------------------------------------------------------------- /App/.devcontainer/library-scripts/azcli-debian.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #------------------------------------------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 
4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 5 | #------------------------------------------------------------------------------------------------------------- 6 | # 7 | # Docs: https://github.com/microsoft/vscode-dev-containers/blob/master/script-library/docs/azcli.md 8 | # Maintainer: The VS Code and Codespaces Teams 9 | # 10 | # Syntax: ./azcli-debian.sh 11 | 12 | set -e 13 | 14 | if [ "$(id -u)" -ne 0 ]; then 15 | echo -e 'Script must be run as root. Use sudo, su, or add "USER root" to your Dockerfile before running this script.' 16 | exit 1 17 | fi 18 | 19 | export DEBIAN_FRONTEND=noninteractive 20 | 21 | # Install curl, apt-transport-https, lsb-release, or gpg if missing 22 | if ! dpkg -s apt-transport-https curl ca-certificates lsb-release > /dev/null 2>&1 || ! type gpg > /dev/null 2>&1; then 23 | if [ ! -d "/var/lib/apt/lists" ] || [ "$(ls /var/lib/apt/lists/ | wc -l)" = "0" ]; then 24 | apt-get update 25 | fi 26 | apt-get -y install --no-install-recommends apt-transport-https curl ca-certificates lsb-release gnupg2 27 | fi 28 | 29 | # Install the Azure CLI 30 | echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/azure-cli.list 31 | curl -sL https://packages.microsoft.com/keys/microsoft.asc | (OUT=$(apt-key add - 2>&1) || echo $OUT) 32 | apt-get update 33 | apt-get install -y azure-cli 34 | echo "Done!" -------------------------------------------------------------------------------- /App/.dockerignore: -------------------------------------------------------------------------------- 1 | 2 | */node_modules 3 | npm-debug.log 4 | */Dockerfile 5 | docker-compose* 6 | .dockerignore 7 | .gitignore 8 | .env 9 | */bin 10 | */obj 11 | */publish 12 | README.md 13 | LICENSE 14 | /Site 15 | /Analysis 16 | 17 | # local configuration 18 | */local.appcfg.json 19 | */local.rootcfg.json 20 | 21 | # any dir starting with. 
(except git) 22 | */.idea 23 | */.vscode 24 | */.Data 25 | -------------------------------------------------------------------------------- /App/.run/Branch Create.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 20 | -------------------------------------------------------------------------------- /App/.run/Build Container.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 20 | -------------------------------------------------------------------------------- /App/.run/Clean.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 20 | -------------------------------------------------------------------------------- /App/.run/Collect List.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 20 | -------------------------------------------------------------------------------- /App/.run/Prod - Collect List.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 25 | -------------------------------------------------------------------------------- /App/.run/Prod.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 24 | -------------------------------------------------------------------------------- /App/.run/Schema.run.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 20 | -------------------------------------------------------------------------------- /App/.run/Update.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 20 | -------------------------------------------------------------------------------- /App/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | 
"ms-azuretools.vscode-azurefunctions", 4 | "ms-vscode.csharp" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /App/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "inputs": [ 4 | { 5 | "id": "ytcli_verb", 6 | "options": ["index", "update", "results", "sunc-db", "version", "warehouse"], 7 | "type": "pickString", 8 | "description": "Choose the YtCli command to debug" 9 | } 10 | ], 11 | "configurations": [ 12 | { 13 | "name": ".NET Core Launch (console)", 14 | "type": "coreclr", 15 | "request": "launch", 16 | "preLaunchTask": "build", 17 | "program": "${workspaceFolder}/YtCli/bin/Debug/netcoreapp3.1/ytnetworks.dll", 18 | "args": ["${input:ytcli_verb}"], //["update", "-a", "Search"], 19 | "cwd": "${workspaceFolder}/YtCli", 20 | "stopAtEntry": false, 21 | "console": "internalConsole" 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /App/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "docker.defaultRegistryPath": "ytnetworks.azurecr.io", 3 | "azureFunctions.deploySubpath": "YtFunctions/bin/Release/net6.0/publish", 4 | "azureFunctions.projectLanguage": "C#", 5 | "azureFunctions.projectRuntime": "~3", 6 | "debug.internalConsoleOptions": "neverOpen", 7 | "azureFunctions.preDeployTask": "publish", 8 | "cSpell.words": [ 9 | "Jsonl", 10 | "Mutuo", 11 | "dataform", 12 | "josnl" 13 | ], 14 | "files.exclude": { 15 | "**/bin": true, 16 | "**/obj": true, 17 | "**/.data": true 18 | }, 19 | "workbench.colorCustomizations": { 20 | "[Default Dark+]": { 21 | "activityBar.background": "#302230", 22 | "statusBar.background": "#974597" 23 | } 24 | }, 25 | "json.schemas": [ 26 | { 27 | "fileMatch": [ 28 | "*.appcfg.json" 29 | ], 30 | "url": "./YtReader/AppCfg.schema.json" 31 | }, 32 | { 33 | "fileMatch": [ 34 | "*.rootcfg.json" 35 | 
], 36 | "url": "./YtReader/RootCfg.schema.json" 37 | } 38 | ], 39 | } -------------------------------------------------------------------------------- /App/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG SEMVER 2 | ARG ASSEMBLY_SEMVER 3 | 4 | FROM mcr.microsoft.com/dotnet/sdk:6.0-alpine AS build-env 5 | 6 | COPY YtCli YtCli/ 7 | COPY YtReader YtReader/ 8 | COPY SysExtensions SysExtensions/ 9 | COPY Mutuo.Etl Mutuo.Etl/ 10 | COPY Mutuo.Tools Mutuo.Tools/ 11 | 12 | ARG SEMVER 13 | ARG ASSEMBLY_SEMVER 14 | RUN echo SemVer={$SEMVER} AssemblySemVer={$ASSEMBLY_SEMVER} 15 | 16 | WORKDIR /YtCli 17 | RUN dotnet publish -c Release -o publish /p:Version=${ASSEMBLY_SEMVER} /p:InformationalVersion=${SEMVER} 18 | 19 | # build runtime image 20 | FROM mcr.microsoft.com/dotnet/runtime:6.0-alpine 21 | WORKDIR /app 22 | COPY --from=build-env YtCli/publish ./ 23 | 24 | # humanizer and possibly other libraries rely on having a culture 25 | ENV DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=false 26 | RUN apk add icu-libs 27 | # it is more flexible to work without the entrypoint. And azure doesn't allow args when entrypoint is used. 
28 | #ENTRYPOINT ["dotnet", "ytnetworks.dll"] 29 | -------------------------------------------------------------------------------- /App/GitVersion.yml: -------------------------------------------------------------------------------- 1 | mode: ContinuousDelivery 2 | branches: {} 3 | ignore: 4 | sha: [] 5 | merge-message-formats: {} 6 | -------------------------------------------------------------------------------- /App/Mutuo.Etl/AzureManagement/AzureEx.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Azure.Management.Fluent; 2 | using Microsoft.Azure.Management.ResourceManager.Fluent; 3 | using Microsoft.Azure.Management.ResourceManager.Fluent.Authentication; 4 | using Mutuo.Etl.Pipe; 5 | using Az = Microsoft.Azure.Management.Fluent.Azure; 6 | 7 | namespace Mutuo.Etl.AzureManagement; 8 | 9 | public static class AzureEx { 10 | public static IAzure GetAzure(this PipeAzureCfg cfg) { 11 | var sp = cfg.ServicePrincipal; 12 | var creds = new AzureCredentialsFactory().FromServicePrincipal(sp.ClientId, sp.Secret, sp.TennantId, AzureEnvironment.AzureGlobalCloud); 13 | var azure = Az.Authenticate(creds).WithSubscription(cfg.SubscriptionId); 14 | return azure; 15 | } 16 | } -------------------------------------------------------------------------------- /App/Mutuo.Etl/Blob/AzureBlobExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | using Azure.Storage.Blobs; 3 | 4 | namespace Mutuo.Etl.Blob; 5 | 6 | public static class AzureBlobExtensions { 7 | public static async Task LoadAsText(this BlobClient blobClient) { 8 | using var memoryStream = new MemoryStream(); 9 | await blobClient.DownloadToAsync(memoryStream); 10 | var text = memoryStream.ToArray().ToStringFromUtf8(); 11 | return text; 12 | } 13 | } -------------------------------------------------------------------------------- /App/Mutuo.Etl/Blob/KeyedCollectionStore.cs: 
-------------------------------------------------------------------------------- 1 | namespace Mutuo.Etl.Blob; 2 | 3 | /// Read/write to storage for a keyed collection of items 4 | /// Each item is stored under Path plus the id returned by GetId 5 | public class KeyedCollectionStore where T : class { 6 | public KeyedCollectionStore(ISimpleFileStore store, Func getId, SPath path) { 7 | Store = store; 8 | GetId = getId; 9 | Path = path; 10 | } 11 | 12 | ISimpleFileStore Store { get; } 13 | Func GetId { get; } 14 | SPath Path { get; } 15 | 16 | public async Task Get(string id) => await Store.GetState(Path.Add(id)); 17 | public async Task Set(T item) => await Store.SetState(Path.Add(GetId(item)), item); 18 | } -------------------------------------------------------------------------------- /App/Mutuo.Etl/Blob/LocalSimpleFileStore.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | 3 | namespace Mutuo.Etl.Blob; 4 | 5 | public class LocalSimpleFileStore : ISimpleFileStore { 6 | readonly FPath Dir; 7 | public LocalSimpleFileStore(FPath dir) => Dir = dir; 8 | 9 | public SPath BasePath => ""; 10 | 11 | public Task Save(SPath path, FPath file, ILogger log = null) { 12 | Dir.Combine(path).EnsureDirectoryExists(); 13 | File.Copy(file.FullPath, Dir.Combine(path).FullPath, overwrite: true); 14 | return Task.CompletedTask; 15 | } 16 | 17 | public async Task Save(SPath path, Stream contents, ILogger log = null) { 18 | var file = ToFPath(path); 19 | file.EnsureDirectoryExists(); 20 | using var ws = file.Open(FileMode.Create); 21 | await contents.CopyToAsync(ws); 22 | } 23 | 24 | public Task Load(SPath path, ILogger log = null) { 25 | Stream s = ToFPath(path).Open(FileMode.Open); 26 | return Task.FromResult(s); 27 | } 28 | 29 | public Task LoadToFile(SPath path, FPath file, ILogger log = null) => throw new NotImplementedException(); 30 | 31 | #pragma warning disable 1998 32 | public async IAsyncEnumerable> List(SPath path, bool allDirectories = false) { 33 | #pragma warning restore
1998 34 | 35 | var dir = ToFPath(path); 36 | if (!dir.Exists) yield break; 37 | var files = dir.Files(recursive: allDirectories); 38 | var res = files.Select(AsListItem).ToArray(); 39 | yield return res; 40 | } 41 | 42 | public Task Delete(SPath path, ILogger log = null) { 43 | var p = ToFPath(path); 44 | var exists = p.Exists; 45 | if (exists) 46 | p.Delete(); 47 | return Task.FromResult(exists); 48 | } 49 | 50 | public Task Info(SPath path) => Task.FromResult(AsListItem(ToFPath(path))); 51 | 52 | public Uri Url(SPath path) => $"file://{ToFPath(path).FullPath}".AsUri(); 53 | public Task Exists(SPath path) => throw new NotImplementedException(); 54 | 55 | FPath ToFPath(SPath path) => Dir.Combine(path.Tokens.ToArray()); 56 | static SPath ToSPath(FPath f) => new(f.Tokens); 57 | 58 | FileListItem AsListItem(FPath f) => new(ToSPath(f).RelativePath(ToSPath(Dir)), f.LastWriteTime()); 59 | 60 | public Task OpenForWrite(SPath path, ILogger log = null) { 61 | var p = ToFPath(path); 62 | p.EnsureDirectoryExists(); 63 | var s = (Stream)p.Open(FileMode.Create, FileAccess.Write); 64 | return Task.FromResult(s); 65 | } 66 | } -------------------------------------------------------------------------------- /App/Mutuo.Etl/GlobalUsings.cs: -------------------------------------------------------------------------------- 1 | global using System; 2 | global using SysExtensions; 3 | global using SysExtensions.Collections; 4 | global using SysExtensions.Net; 5 | global using SysExtensions.Serialization; 6 | global using SysExtensions.Text; 7 | global using SysExtensions.Threading; 8 | global using System.Linq; 9 | global using System.Net.Http; 10 | global using Serilog; 11 | global using System.Text.RegularExpressions; 12 | global using Humanizer; 13 | global using System.Threading.Tasks; 14 | global using System.Collections.Generic; 15 | global using System.Threading; 16 | global using SysExtensions.IO; -------------------------------------------------------------------------------- 
/App/Mutuo.Etl/Mutuo.Etl.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | net6.0 4 | 10 5 | warnings 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /App/Mutuo.Etl/Pipe/ContainerLauncher.cs: -------------------------------------------------------------------------------- 1 | using Autofac; 2 | 3 | namespace Mutuo.Etl.Pipe; 4 | 5 | public interface IContainerLauncher { 6 | /// Runs the given container until completion 7 | Task RunContainer(string containerName, string fullImageName, (string name, string value)[] envVars, 8 | string[] args = null, 9 | bool returnOnStart = false, 10 | string exe = null, 11 | string groupName = null, 12 | ContainerCfg cfg = null, 13 | ILogger log = null, 14 | CancellationToken cancel = default); 15 | } 16 | 17 | public class ContainerLauncher : IContainerLauncher { 18 | readonly PipeAppCfg Cfg; 19 | readonly IPipeCtx Ctx; 20 | 21 | public ContainerLauncher(PipeAppCfg cfg, IPipeCtx ctx) { 22 | Cfg = cfg; 23 | Ctx = ctx; 24 | } 25 | 26 | public async Task RunContainer(string containerName, string fullImageName, (string name, string value)[] envVars, 27 | string[] args = null, 28 | bool returnOnStart = false, 29 | string exe = null, 30 | string groupName = null, 31 | ContainerCfg cfg = null, 32 | ILogger log = null, 33 | CancellationToken cancel = default) { 34 | IContainerLauncher launcher = Cfg.Location switch { 35 | PipeRunLocation.Container => Ctx.Scope.Resolve(), 36 | _ => Ctx.Scope.Resolve() 37 | }; 38 | await launcher.RunContainer(containerName, fullImageName, envVars, args, returnOnStart, exe, groupName, cfg, log, cancel); 39 | } 40 | } -------------------------------------------------------------------------------- /App/Mutuo.Etl/Pipe/DependencyGraph.cs:
--------------------------------------------------------------------------------
using System.Linq.Expressions;

namespace Mutuo.Etl.Pipe;

/// <summary>
///   Directed dependency graph over nodes identified by a string key. Edges are stored by key in both directions
///   so that dependencies (from -> to) and dependants (to -> from) can both be looked up directly.
/// </summary>
/// <typeparam name="T">the node type</typeparam>
// NOTE(review): the generic arguments in this file were reconstructed from usage (the extracted text had lost
// everything inside angle brackets); confirm the argument order of KeyedCollection/IKeyedCollection.
public class DependencyGraph<T> {
  readonly MultiValueDictionary<string, string> DepsByFrom = new MultiValueDictionary<string, string>();
  readonly MultiValueDictionary<string, string> DepsByTo   = new MultiValueDictionary<string, string>();
  readonly IKeyedCollection<string, T>          _nodes;
  readonly Expression<Func<T, string>>          _getKey;

  /// <summary>Builds the graph from a set of nodes</summary>
  /// <param name="nodes">all nodes in the graph</param>
  /// <param name="getDependencies">returns the keys of the nodes that the given node depends on</param>
  /// <param name="getKey">returns the key that identifies a node</param>
  public DependencyGraph(IEnumerable<T> nodes, Func<T, IEnumerable<string>> getDependencies, Expression<Func<T, string>> getKey) {
    _getKey = getKey;
    _nodes = new KeyedCollection<string, T>(getKey);
    _nodes.AddRange(nodes);

    foreach (var node in _nodes)
    foreach (var d in getDependencies(node))
      AddDependency(GetKey(node), d);
  }

  /// <summary>The node with the given key</summary>
  public T this[string key] => _nodes[key];

  string GetKey(T item) => _nodes.GetKey(item);

  /// <summary>A snapshot copy of all nodes (a new list is created on every access)</summary>
  public IReadOnlyCollection<T> Nodes => _nodes.ToList();

  /// <summary>Records that the node keyed <paramref name="from" /> depends on the node keyed <paramref name="to" /></summary>
  public void AddDependency(string from, string to) {
    DepsByFrom.Add(from, to);
    DepsByTo.Add(to, from);
  }

  /// <summary>The direct dependencies of the node. Dependency keys that don't resolve to a node are skipped</summary>
  public IEnumerable<T> Dependencies(T node) => DepsByFrom.TryGet(GetKey(node)).Select(to => _nodes[to]).NotNull();

  /// <summary>
  ///   All direct and transitive dependencies of the node, each returned once. For every visited node its newly
  ///   discovered direct dependencies are returned first, followed by the dependencies found beneath each of them.
  ///   Safe to call on graphs that contain cycles; in that case the starting node can appear in its own result.
  /// </summary>
  public IEnumerable<T> DependenciesDeep(T node) {
    var discoveredDeps = new KeyedCollection<string, T>(_getKey);
    // Keys of nodes whose dependencies have already been (or are currently being) walked. Without this guard the
    // recursion re-enters nodes it has already visited: shared sub-graphs are walked repeatedly and any cycle
    // recurses forever. Expanding an already-expanded node can never discover anything new, so skipping it
    // leaves the result and its order unchanged for acyclic graphs.
    var expanded = new HashSet<string>();

    IEnumerable<T> InnerDescendentDeps(T n) {
      if (!expanded.Add(GetKey(n))) yield break;

      var childDeps = Dependencies(n).ToList();
      foreach (var dep in childDeps.Where(c => !discoveredDeps.Contains(c)))
        yield return discoveredDeps.AddItem(dep);

      foreach (var dep in childDeps.SelectMany(InnerDescendentDeps))
        yield return dep;
    }

    foreach (var dep in InnerDescendentDeps(node))
      yield return dep;
  }

  /// <summary>The nodes that directly depend on the given node. Keys that don't resolve to a node are skipped, matching Dependencies</summary>
  public IEnumerable<T> Dependants(T node) => DepsByTo.TryGet(GetKey(node)).Select(from => _nodes[from]).NotNull();
}
--------------------------------------------------------------------------------
/App/Mutuo.Etl/Pipe/LocalPipeWorker.cs:
--------------------------------------------------------------------------------
1 |
using Medallion.Shell; 2 | using Semver; 3 | 4 | namespace Mutuo.Etl.Pipe; 5 | 6 | public class LocalPipeWorker : IPipeWorker, IContainerLauncher { 7 | readonly SemVersion Version; 8 | 9 | public LocalPipeWorker(SemVersion version) => Version = version; 10 | 11 | public async Task> Launch(IPipeCtx ctx, IReadOnlyCollection ids, ILogger log, CancellationToken cancel) => 12 | await ids.BlockDo(async id => { 13 | var runCfg = id.PipeCfg(ctx.PipeCfg); 14 | var image = runCfg.Container.FullContainerImageName(Version.PipeTag()); 15 | var args = new[] { "run" } 16 | .Concat(ctx.AppCtx.EnvironmentVariables.SelectMany(e => new[] { "--env", $"{e.name}={e.value}" })) 17 | .Concat("--rm", "-i", image) 18 | .Concat(runCfg.Container.Exe) 19 | .Concat(id.PipeArgs()) 20 | .ToArray(); 21 | var cmd = Command.Run("docker", args, o => o.CancellationToken(cancel)).RedirectTo(Console.Out); 22 | var res = await cmd.Task; 23 | PipeRunMetadata md = res.Success 24 | ? new() { Id = id } 25 | : new() { 26 | Id = id, 27 | ErrorMessage = await cmd.StandardError.ReadToEndAsync() 28 | }; 29 | await md.Save(ctx.Store, log); 30 | return md; 31 | }).ToArrayAsync(); 32 | 33 | public async Task RunContainer(string containerName, string fullImageName, (string name, string value)[] envVars, string[] args = null, 34 | bool returnOnStart = false, string exe = null, 35 | string groupName = null, ContainerCfg cfg = null, ILogger log = null, CancellationToken cancel = default) { 36 | groupName ??= containerName; 37 | var dockerArgs = new[] { "run" } 38 | .Concat(envVars.SelectMany(e => new[] { "--env", $"{e.name}={e.value}" })) 39 | .Concat("--rm", "-i", fullImageName) 40 | .Concat(exe) 41 | .Concat(args) 42 | .NotNull() 43 | .ToArray(); 44 | log?.Debug($"LocalPipeWorker - launching docker: docker {exe ?? 
""} {dockerArgs.Join(" ", o => o.ToString())}"); 45 | var cmd = Command.Run("docker", dockerArgs, o => o.CancellationToken(cancel)).RedirectTo(Console.Out); 46 | var res = await cmd.Task; 47 | if (!res.Success) throw new InvalidOperationException($"Container {groupName} failed ({res.ExitCode}): {res.StandardError}"); 48 | } 49 | } -------------------------------------------------------------------------------- /App/Mutuo.Tools/Mutuo.Tools.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | Exe 4 | net6.0 5 | 10 6 | 7 | 8 | 9 | 1701;1702;1591 10 | 11 | 12 | 13 | 1701;1702;1591 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /App/Mutuo.Tools/Program.cs: -------------------------------------------------------------------------------- 1 | using System.Threading.Tasks; 2 | using Autofac; 3 | using CliFx; 4 | using Serilog; 5 | using Serilog.Events; 6 | 7 | namespace Mutuo.Tools; 8 | 9 | class Program { 10 | static async Task Main(string[] args) { 11 | var log = new LoggerConfiguration() 12 | .WriteTo.Console(LogEventLevel.Information).CreateLogger(); 13 | var cb = new ContainerBuilder(); 14 | cb.Register(_ => log); 15 | cb.RegisterAssemblyTypes(typeof(Program).Assembly).AssignableTo(); 16 | using var scope = cb.Build(); 17 | 18 | var app = new CliApplicationBuilder() 19 | .AddCommandsFromThisAssembly() 20 | .UseTypeActivator(t => scope.Resolve(t)) 21 | .SetTitle("Mutuo Tools") 22 | .Build(); 23 | 24 | return await app.RunAsync(args); 25 | } 26 | } -------------------------------------------------------------------------------- /App/SysExtensions/AutofacExtensions.cs: -------------------------------------------------------------------------------- 1 | namespace SysExtensions; 2 | 3 | /// Use for generic type interence 4 | /// the type of the parameter 5 | public static class Typ { 6 | public static Of Of() => new Of(); 7 | 8 
| public static string Hell() => "string"; 9 | } 10 | 11 | public class Of { }
--------------------------------------------------------------------------------
/App/SysExtensions/Build/GitVersionInfo.cs:
--------------------------------------------------------------------------------
using System.Reflection;
using Medallion.Shell;
using Newtonsoft.Json.Linq;
using Semver;
using Serilog.Core;
using SysExtensions.IO;
using SysExtensions.Text;

namespace SysExtensions.Build;

/// <summary>The subset of the JSON variables written by the <c>dotnet gitversion</c> tool that this app reads</summary>
public class GitVersionInfo {
  public string SemVer          { get; set; }
  public string FullSemVer      { get; set; }
  public string BranchName      { get; set; }
  public string MajorMinorPatch { get; set; }
  public string NuGetVersionV2  { get; set; }

  public int Major { get; set; }
  public int Minor { get; set; }

  /// <summary>Patch component of the version. GitVersion names this variable "Patch"</summary>
  public int Patch { get; set; }

  /// <summary>
  ///   NOTE(review): looks like a misspelling of <see cref="Patch" />. GitVersion does not emit a "Path" variable,
  ///   so this is presumably never populated. Kept so existing callers still compile.
  /// </summary>
  public int Path { get; set; }

  /// <summary>
  ///   Works out the current version. When a ".git" directory is located from the working directory, runs
  ///   <c>dotnet gitversion</c> in the repository's "App" directory and parses its JSON output. Otherwise, or
  ///   when that output can't be parsed, falls back to the informational version of the assembly that contains
  ///   <paramref name="typeToDetectVersion" />.
  /// </summary>
  /// <param name="typeToDetectVersion">a type from the assembly whose version is used as the fallback</param>
  /// <param name="log">optional logger; defaults to the global Serilog logger</param>
  /// <returns>the version, plus the gitversion details. <c>info</c> is null when the assembly version was used</returns>
  /// <exception cref="InvalidOperationException">the fallback assembly has no informational version</exception>
  public static async Task<(SemVersion version, GitVersionInfo info)> DiscoverVersion(Type typeToDetectVersion, ILogger log = null) {
    log ??= Log.Logger ?? Logger.None; // log is never null below this line
    var rootPath = FPath.WorkingDir.DirOfParent(".git")?.Parent();
    if (rootPath?.Exists == true) {
      var outputLines = new List<string>();
      var appDir = rootPath.Combine("App");
      var shell = new Shell(o => o.WorkingDirectory(appDir.FullPath));
      var process = shell.Run("dotnet", "gitversion");
      await process.StandardOutput.PipeToAsync(outputLines);
      await process.Task;
      try {
        var jVersion = JObject.Parse(outputLines.Join("\n"));
        var gitVersion = jVersion.ToObject<GitVersionInfo>();

        log.Debug("{Noun} - '.git/' detected. Discovered version: {Version}", nameof(GitVersionInfo), gitVersion.SemVer);

        return (SemVersion.Parse(gitVersion.SemVer), gitVersion);
      }
      catch (Exception ex) {
        // The exception must be the first argument for Serilog to record it. The tool output is passed as a
        // property rather than interpolated, because it is JSON and its braces would be parsed as template holes.
        log.Error(ex, "Unable to parse result from gitversion: {Output}", outputLines.Join(" "));
      }
    }
    // best effort: not in a git checkout, or gitversion output was unusable
    var assemblyVersion = RuntimeSemVer(typeToDetectVersion);
    log.Debug("{Noun} - Using assembly version: {Version}", nameof(GitVersionInfo), assemblyVersion);
    return (assemblyVersion, null);
  }

  /// <summary>The informational version recorded in the assembly that contains <paramref name="type" /></summary>
  /// <exception cref="InvalidOperationException">the assembly has no informational version attribute</exception>
  public static SemVersion RuntimeSemVer(Type type) =>
    SemVersion.Parse(type.Assembly.GetCustomAttribute<AssemblyInformationalVersionAttribute>()?.InformationalVersion ??
      throw new InvalidOperationException($"Can't find {type.Assembly.GetName().Name} InformationalVersion"));
}

public static class SemVerEx {
  /// <summary>Formats the version as "major.minor.patch", without pre-release or build parts</summary>
  public static string MajorMinorPatch(this SemVersion v) => $"{v.Major}.{v.Minor}.{v.Patch}";
}
--------------------------------------------------------------------------------
/App/SysExtensions/Collections/AsyncEnumerableExtensions.cs:
--------------------------------------------------------------------------------
1 | using System.Runtime.CompilerServices; 2 | using SysExtensions.Reflection; 3 | 4 | namespace SysExtensions.Collections; 5 | 6 | public static class AsyncEnumerableExtensions { 7 | public static async IAsyncEnumerable SelectMany(this IAsyncEnumerable> items) { 8 | await foreach (var g in items) 9 | foreach (var i in g) 10 | yield return i; 11 | } 12 | 13 | /// Like TakeWhile, but will include the first element that doesn't meet the predicate 14 | public static async IAsyncEnumerable TakeWhileInclusive(this IAsyncEnumerable source, Func predicate 15 | , [EnumeratorCancellation] CancellationToken cancel = default) { 16 | await foreach (var e in source.WithCancellation(cancel).ConfigureAwait(false)) { 17 | yield return e; 18 | if (!predicate(e)) 19 | break; 20 | } 21 | } 22 | 23 | public static IAsyncEnumerable NotNull(this IAsyncEnumerable items) =>
items.Where(i => !i.NullOrDefault()); 24 | 25 | /// Batch into size chunks lazily 26 | public static async IAsyncEnumerable Batch(this IAsyncEnumerable items, int size) { 27 | var batch = new List(); 28 | await foreach (var item in items) { 29 | batch.Add(item); 30 | if (batch.Count < size) continue; 31 | yield return batch.ToArray(); 32 | batch.Clear(); 33 | } 34 | if (batch.Count > 0) 35 | yield return batch.ToArray(); 36 | } 37 | 38 | public static IAsyncEnumerable<(T item, int index)> WithIndex(this IAsyncEnumerable items) => items.Select((item, index) => (item, index)); 39 | } -------------------------------------------------------------------------------- /App/SysExtensions/Collections/QueueExtensions.cs: -------------------------------------------------------------------------------- 1 | namespace SysExtensions.Collections; 2 | 3 | public static class QueueExtensions { 4 | public static IEnumerable Dequeue(this Queue queue, int number) { 5 | for (var i = 0; i < number; i++) { 6 | var t = queue.TryDequeue(); 7 | if (t == null) break; 8 | yield return t; 9 | } 10 | } 11 | 12 | public static void Enqueue(this Queue queue, IEnumerable items) { 13 | foreach (var item in items) 14 | queue.Enqueue(item); 15 | } 16 | 17 | public static Queue ToQueue(this IEnumerable items) => new Queue(items); 18 | 19 | public static Stack ToStack(this IEnumerable items) => new Stack(items); 20 | 21 | public static T TryPop(this Stack stack) => stack.Count > 0 ? stack.Pop() : default; 22 | 23 | public static T TryDequeue(this Queue queue) => queue.Count > 0 ? 
queue.Dequeue() : default; 24 | } -------------------------------------------------------------------------------- /App/SysExtensions/DateTimeExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | 3 | namespace SysExtensions; 4 | 5 | public static class DateTimeExtensions { 6 | public static string FileSafeTimestamp(this DateTime value) => value.ToString("yyyy-MM-dd_HH-mm-ss-fffffff", CultureInfo.InvariantCulture); 7 | public static DateTime ParseFileSafeTimestamp(this string ts) => DateTime.ParseExact(ts, "yyyy-MM-dd_HH-mm-ss-fffffff", CultureInfo.InvariantCulture); 8 | 9 | public static DateTime ParseDate(this string s, IFormatProvider format = default, DateTimeStyles style = default) => DateTime.Parse(s, format, style); 10 | 11 | public static DateTime? TryParseDate(this string s, IFormatProvider format = default, DateTimeStyles style = DateTimeStyles.None) => 12 | DateTime.TryParse(s, format ?? CultureInfo.InvariantCulture, style, out var d) ? d : null; 13 | 14 | public static DateTime? TryParseDateExact(this string s, string format, DateTimeStyles style = DateTimeStyles.None) => 15 | DateTime.TryParseExact(s, format, CultureInfo.InvariantCulture, style, out var d) ? d : null; 16 | 17 | public static string DateString(this DateTime value) => value.ToString("yyyy-MM-dd"); 18 | public static DateTime Epoc { get; } = new DateTime(year: 1970, month: 1, day: 1); 19 | 20 | public static TimeSpan? TryParseTimeSpan(this string s) => TimeSpan.TryParse(s, CultureInfo.InvariantCulture, out var d) ? d : null; 21 | 22 | public static TimeSpan? TryParseTimeSpanExact(this string s, params string[] formats) => 23 | TimeSpan.TryParseExact(s, formats, CultureInfo.InvariantCulture, out var d) ? 
d : null; 24 | 25 | public static DateTime Before(this TimeSpan ts, DateTime date) => date - ts; 26 | public static DateTime After(this TimeSpan ts, DateTime date) => date + ts; 27 | 28 | public static TimeSpan Age(this DateTime value) => DateTime.UtcNow - value; 29 | } -------------------------------------------------------------------------------- /App/SysExtensions/GlobalUsings.cs: -------------------------------------------------------------------------------- 1 | global using System; 2 | global using System.Linq; 3 | global using System.Net.Http; 4 | global using Serilog; 5 | global using System.Text.RegularExpressions; 6 | global using System.Threading.Tasks; 7 | global using System.Collections.Generic; 8 | global using System.Threading; -------------------------------------------------------------------------------- /App/SysExtensions/IO/CsvExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | using System.IO; 3 | using CsvHelper; 4 | using CsvHelper.Configuration; 5 | 6 | namespace SysExtensions.IO; 7 | 8 | public static class CsvExtensions { 9 | public static void WriteToCsv(this IEnumerable values, FPath path, CsvConfiguration cfg = null) { 10 | using (var fs = path.Open(FileMode.Create)) 11 | using (var tw = new StreamWriter(fs)) { 12 | cfg ??= new CsvConfiguration(CultureInfo.InvariantCulture); 13 | var csv = new CsvWriter(tw, cfg); 14 | csv.WriteRecords(values); 15 | } 16 | } 17 | 18 | public static CsvConfiguration DefaultConfig => new CsvConfiguration(CultureInfo.InvariantCulture) 19 | { AllowComments = true, IgnoreBlankLines = true, TrimOptions = TrimOptions.Trim, MissingFieldFound = null }; 20 | 21 | public static ICollection ReadFromCsv(this FPath path, CsvConfiguration cfg = null) { 22 | cfg ??= DefaultConfig; 23 | using (var fs = path.OpenText()) { 24 | var csv = new CsvReader(fs, cfg); 25 | return csv.GetRecords().ToList(); 26 | } 27 | } 28 | 29 | public static 
ICollection ReadFromCsv(string data, CsvConfiguration cfg = null) { 30 | cfg = cfg ?? DefaultConfig; 31 | using (var tr = new StringReader(data)) { 32 | var csv = new CsvReader(tr, cfg); 33 | return csv.GetRecords().ToList(); 34 | } 35 | } 36 | }
--------------------------------------------------------------------------------
/App/SysExtensions/IO/IOExtensions.cs:
--------------------------------------------------------------------------------
using System.IO;
using Humanizer;
using Humanizer.Bytes;
using SysExtensions.Threading;

namespace SysExtensions.IO;

public static class IOExtensions {
  /// <summary>
  ///   The directory containing the file, followed by each ancestor directory (nearest first). Stops at the
  ///   file system root or at the first ancestor that does not exist.
  /// </summary>
  public static IEnumerable<DirectoryInfo> ParentDirectories(this FileInfo file) {
    var dir = file.Directory;
    // the containing directory is returned even if it does not exist; only ancestors are checked
    while (dir != null) {
      yield return dir;
      dir = dir.Parent;
      if (dir is { Exists: false }) break;
    }
  }

  /// <summary>
  ///   The directories between <paramref name="root" /> and the file, ordered from the one directly beneath the
  ///   root down to the directory containing the file. The root itself is not included.
  /// </summary>
  public static IEnumerable<DirectoryInfo> RelativePath(this FileInfo file, DirectoryInfo root) {
    var path = file.ParentDirectories().TakeWhile(d => d.FullName != root.FullName);
    return path.Reverse();
  }

  /// <summary>Copies a stream to another while periodically reporting the number of bytes transferred so far</summary>
  /// <param name="source">stream to read from</param>
  /// <param name="dest">stream to write to</param>
  /// <param name="onProgress">called every <paramref name="progressCadence" /> with the running byte total</param>
  /// <param name="cancel">cancels the copy as well as the progress reporting</param>
  /// <param name="bufferBytes">size of the copy buffer. Defaults to 100 KB</param>
  /// <param name="progressCadence">time between progress callbacks. Defaults to 1 second</param>
  /// <exception cref="OperationCanceledException">when <paramref name="cancel" /> is signalled during a read or write</exception>
  public static async Task CopyToAsync(this Stream source, Stream dest, Action<long> onProgress, CancellationToken cancel = default,
    ByteSize? bufferBytes = null, TimeSpan? progressCadence = null) {
    bufferBytes ??= 100.Kilobytes();
    progressCadence ??= 1.Seconds();

    var buffer = new byte[(int)bufferBytes.Value.Bytes];
    var transferred = 0L;

    // Linked to the caller's token so the reporting loop stops on external cancellation, and disposed with the
    // method so neither the source nor its registration on the caller's token outlives the copy.
    using var progressCancel = CancellationTokenSource.CreateLinkedTokenSource(cancel);

    async Task Progress() {
      while (!progressCancel.IsCancellationRequested) {
        await progressCadence.Value.Delay(progressCancel.Token).Swallow();
        if (!progressCancel.IsCancellationRequested)
          onProgress(Interlocked.Read(ref transferred));
      }
    }

    var progTask = Progress();
    try {
      int count;
      while ((count = await source.ReadAsync(buffer.AsMemory(), cancel)) != 0) {
        Interlocked.Add(ref transferred, count);
        await dest.WriteAsync(buffer.AsMemory(start: 0, count), cancel);
      }
    }
    finally {
      // always stop the reporting loop, including when a read or write throws
      progressCancel.Cancel();
      await progTask;
    }
  }
}
--------------------------------------------------------------------------------
/App/SysExtensions/LogExtensions.cs:
--------------------------------------------------------------------------------
1 | namespace SysExtensions; 2 | 3 | public static class LogExtensions { 4 | public static ILogger Scope(this ILogger log, string scope) => log.ForContext("Scope", scope); 5 | }
--------------------------------------------------------------------------------
/App/SysExtensions/Net/FunctionExtensions.cs:
--------------------------------------------------------------------------------
1 | using System.Net; 2 | 3 | namespace SysExtensions.Net; 4 | 5 | public static class FunctionExtensions { 6 | public static HttpResponseMessage AsyncResponse(this HttpRequestMessage req, string message) => new HttpResponseMessage(HttpStatusCode.OK) 7 | { RequestMessage = req, Content = new StringContent(message) }; 8 | }
--------------------------------------------------------------------------------
/App/SysExtensions/Net/HttpExtensions.cs:
-------------------------------------------------------------------------------- 1 | using System.Net; 2 | 3 | namespace SysExtensions.Net; 4 | 5 | public static class HttpExtensions { 6 | public static UriBuilder Build(this Uri uri) => new(uri); 7 | public static Uri AsUri(this string url) => new(url); 8 | 9 | public static void EnsureSuccess(int status, string url) { 10 | if (!IsSuccess(status)) throw new HttpRequestException($"{url} failed with '{status}'", inner: null, (HttpStatusCode)status); 11 | } 12 | 13 | public static void EnsureSuccess(this HttpStatusCode status, string url) => EnsureSuccess((int)status, url); 14 | public static bool IsSuccess(int code) => code >= 200 && code <= 299; 15 | public static bool IsSuccess(this HttpStatusCode code) => IsSuccess((int)code); 16 | 17 | public static bool IsTransientError(int code) => !IsSuccess(code) && code switch { 18 | < 500 => code.In(408, 429, 425), 19 | _ => true 20 | }; 21 | 22 | public static bool IsTransientError(this HttpStatusCode code) => IsTransientError((int)code); 23 | } -------------------------------------------------------------------------------- /App/SysExtensions/Net/Policies.cs: -------------------------------------------------------------------------------- 1 | using Humanizer; 2 | using Polly; 3 | using Polly.Retry; 4 | using Troschuetz.Random; 5 | 6 | namespace SysExtensions.Net; 7 | 8 | public static class Policies { 9 | const double DeviationPercent = 0.2; 10 | static readonly TRandom _rand = new(); 11 | static readonly TimeSpan MinWait = 50.Milliseconds(); 12 | static readonly TimeSpan MaxWait = 5.Minutes(); 13 | 14 | public static TimeSpan ExponentialBackoff(this int attempt, TimeSpan? firstWait = null) { 15 | var firstWaitValue = firstWait ?? 
MinWait; 16 | var waitValue = firstWaitValue.TotalMilliseconds * Math.Pow(x: 2, attempt - 1); 17 | var waitWithRandomness = _rand.Normal(waitValue, waitValue * DeviationPercent).Milliseconds(); 18 | if (waitWithRandomness < MinWait) waitWithRandomness = MinWait; 19 | if (waitWithRandomness > MaxWait) waitWithRandomness = MaxWait; 20 | return waitWithRandomness; 21 | } 22 | 23 | public static AsyncRetryPolicy RetryWithBackoff(this PolicyBuilder policy, string description, int retryCount = 3, 24 | Action, int, TimeSpan> onError = null, 25 | ILogger log = null) => 26 | policy.RetryAsync(retryCount, async (e, i, _) => { 27 | var delay = i.ExponentialBackoff(1.Seconds()); 28 | if (onError == null) 29 | log?.Debug("retryable error with {Description}: '{Error}'. Retrying in {Duration}, attempt {Attempt}/{Total}", 30 | description, e.Exception?.Message ?? "Unknown error", delay, i, retryCount); 31 | else 32 | onError(e, i, delay); 33 | await Task.Delay(delay); 34 | }); 35 | 36 | public static AsyncRetryPolicy RetryBackoff(this PolicyBuilder policy, string description, int retryCount = 3, TimeSpan? initialDelay = null, 37 | ILogger log = null) => 38 | policy.RetryAsync(retryCount, async (e, i) => { 39 | var delay = i.ExponentialBackoff(initialDelay ?? 1.Seconds()); 40 | log?.Debug("retryable error with {Description}: '{Error}'. 
Retrying in {Duration}, attempt {Attempt}/{Total}", 41 | description, e.Message, delay, i, retryCount); 42 | await Task.Delay(delay); 43 | }); 44 | } -------------------------------------------------------------------------------- /App/SysExtensions/NumberExtensions.cs: -------------------------------------------------------------------------------- 1 | namespace SysExtensions; 2 | 3 | public static class NumberExtensions { 4 | public static ulong RoundToULong(this double value) => (ulong)Math.Round(value); 5 | public static long RoundToLong(this double value) => (long)Math.Round(value); 6 | public static int RoundToInt(this double value) => (int)Math.Round(value); 7 | public static double Pow(this int x, int y) => Math.Pow(x, y); 8 | // NOTE(review): `to` is passed to Enumerable.Range as the element count, not as an upper bound, so the sequence only ends near `to` when `from` is 0 or 1 - confirm the intended bounds with callers before changing 9 | public static IEnumerable RangeTo(this int from, int to) => Enumerable.Range(from, to); 10 | /// Returns `count` consecutive integers starting at `from` (Enumerable.Range takes a start and a count, not an end) 11 | public static IEnumerable Range(this int from, int count) => Enumerable.Range(from, count); 12 | public static int Abs(this int num) => Math.Abs(num); 13 | public static bool Between(this int num, int from, int to) => num >= from && num <= to; 14 | } -------------------------------------------------------------------------------- /App/SysExtensions/Reflection/ExpressionExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Linq.Expressions; 2 | 3 | namespace SysExtensions.Reflection; 4 | 5 | public static class ExpressionExtensions { 6 | public static object GetValue(this Expression expression) => Expression.Lambda(expression).Compile().DynamicInvoke(); 7 | } -------------------------------------------------------------------------------- /App/SysExtensions/Security/NameSecret.cs: -------------------------------------------------------------------------------- 1 | using System.ComponentModel; 2 | using System.Security; 3 | using SysExtensions.Collections; 4 | using SysExtensions.Serialization; 5 | using SysExtensions.Text; 6 | 7 | namespace SysExtensions.Security; 8 | 9 | /// 
Credentials for a user (in the format name:secret). Be careful not to serialize this. it is not encrypted 10 | [TypeConverter(typeof(StringConverter))] 11 | public sealed class NameSecret : IStringConvertableWithPattern { 12 | public NameSecret() { } 13 | 14 | public NameSecret(string encodedValue) { 15 | var (name, secret) = Parse(encodedValue); 16 | Name = name; 17 | Secret = secret; 18 | } 19 | 20 | public NameSecret(string name, string secret) { 21 | Name = name; 22 | Secret = secret; 23 | } 24 | 25 | public string Name { get; set; } 26 | public string Secret { get; set; } 27 | 28 | public string StringValue { 29 | get => $"{Name}:{Secret}"; 30 | set { 31 | var (name, secret) = Parse(value); 32 | Name = name; 33 | Secret = secret; 34 | } 35 | } 36 | 37 | public string Pattern => @"([^:\n]+):([^:\n]+)"; 38 | 39 | public override string ToString() => StringValue; 40 | 41 | static (string name, string secret) Parse(string value) { 42 | var tokens = value.UnJoin(':').ToQueue(); 43 | var name = tokens.TryDequeue(); 44 | var secret = tokens.TryDequeue(); 45 | return (name, secret); 46 | } 47 | 48 | public SecureString SecureString() { 49 | var ss = new SecureString(); 50 | foreach (var c in Secret) 51 | ss.AppendChar(c); 52 | ss.MakeReadOnly(); 53 | return ss; 54 | } 55 | } -------------------------------------------------------------------------------- /App/SysExtensions/Serialization/CoreSerializeContractResolver.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.ComponentModel; 3 | using System.Reflection; 4 | using Newtonsoft.Json; 5 | using Newtonsoft.Json.Serialization; 6 | using SysExtensions.Reflection; 7 | 8 | namespace SysExtensions.Serialization; 9 | 10 | /// Camel case properties. 
Also when using OptOut, then only public properties that are writable are serialized by 11 | /// default 12 | public class CoreSerializeContractResolver : CamelCasePropertyNamesContractResolver { 13 | protected override JsonProperty CreateProperty(MemberInfo member, MemberSerialization memberSerialization) { 14 | var prop = base.CreateProperty(member, memberSerialization); 15 | if (memberSerialization != MemberSerialization.OptOut) return prop; 16 | 17 | // classes with no default constructors should serialize as per normal. NOTE(review): the check below returns early when a public parameterless constructor IS found, which reads as the opposite of this comment - confirm which one is intended 18 | var emptyConstructor = member.DeclaringType?.GetConstructor(Type.EmptyTypes); 19 | if (emptyConstructor != null) return prop; 20 | 21 | // by default only writable properties should be serialized (read-only collections and members carrying the checked attribute are kept) 22 | if (!prop.Writable && !prop.PropertyType.IsCollection() 23 | && member.GetCustomAttribute(true) == null) 24 | return null; 25 | return prop; 26 | } 27 | 28 | /// Determines which contract type is created for the given type. 29 | /// Type of the object. 30 | /// A for the given type. 31 | protected override JsonContract CreateContract(Type objectType) { 32 | var contract = base.CreateContract(objectType); 33 | 34 | // by default a type that can convert to string and that is also enumerable will have an array contract, but serialize to a string!. 
fix this 35 | if (contract is JsonArrayContract && typeof(IEnumerable).IsAssignableFrom(objectType) && 36 | CanNonSystemTypeDescriptorConvertString(objectType, out var converter)) 37 | contract = CreateStringContract(objectType); 38 | return contract; 39 | } 40 | 41 | public static bool CanNonSystemTypeDescriptorConvertString(Type type, out TypeConverter typeConverter) { 42 | typeConverter = TypeDescriptor.GetConverter(type); 43 | 44 | // use the objectType's TypeConverter if it has one and can convert to a string 45 | var converterType = typeConverter.GetType(); 46 | if (!converterType.FullName.StartsWith("System.ComponentModel") && converterType != typeof(TypeConverter)) { 47 | var canConvert = typeConverter.CanConvertTo(typeof(string)); 48 | return canConvert; 49 | } 50 | return false; 51 | } 52 | } -------------------------------------------------------------------------------- /App/SysExtensions/Serialization/XmlExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Xml.Linq; 2 | 3 | namespace SysExtensions.Serialization; 4 | 5 | public static class XmlExtensions { 6 | public static XElement StripNamespaces(this XElement element) { 7 | // Original code credit: http://stackoverflow.com/a/1147012 8 | 9 | var result = new XElement(element); 10 | foreach (var e in result.DescendantsAndSelf()) { 11 | e.Name = XNamespace.None.GetName(e.Name.LocalName); 12 | var attributes = e.Attributes() 13 | .Where(a => !a.IsNamespaceDeclaration) 14 | .Where(a => a.Name.Namespace != XNamespace.Xml && a.Name.Namespace != XNamespace.Xmlns) 15 | .Select(a => new XAttribute(XNamespace.None.GetName(a.Name.LocalName), a.Value)); 16 | e.ReplaceAttributes(attributes); 17 | } 18 | 19 | return result; 20 | } 21 | } -------------------------------------------------------------------------------- /App/SysExtensions/ShortGuid.cs: -------------------------------------------------------------------------------- 1 | using 
SysExtensions.Text; 2 | 3 | namespace SysExtensions; 4 | 5 | public static class ShortGuid { 6 | public static string ToShortString(this Guid guid, int? length = null) { 7 | var base64Guid = Convert.ToBase64String(guid.ToByteArray()) 8 | .Replace(oldChar: '+', newChar: '-').Replace(oldChar: '/', newChar: '_'); 9 | var s = base64Guid.Substring(startIndex: 0, base64Guid.Length - 2); 10 | return length.HasValue ? s.Right(length.Value) : s; 11 | } 12 | 13 | public static string Create(int? length = null) => Guid.NewGuid().ToShortString(length); 14 | } -------------------------------------------------------------------------------- /App/SysExtensions/SysExtensions.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | net6.0 4 | 10 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /App/SysExtensions/Threading/Defer.cs: -------------------------------------------------------------------------------- 1 | using Nito.AsyncEx; 2 | using SysExtensions.Reflection; 3 | 4 | namespace SysExtensions.Threading; 5 | 6 | public class Defer : IAsyncDisposable { 7 | readonly SemaphoreSlim _lock = new SemaphoreSlim(initialCount: 1, maxCount: 1); 8 | public Defer(Func> creator) => Creator = creator; 9 | Func> Creator { get; } 10 | public T Value; 11 | 12 | public async Task GetOrCreate() { 13 | if (!Value.NullOrDefault()) 14 | return Value; 15 | using (await _lock.LockAsync()) { 16 | if (!Value.NullOrDefault()) 17 | return Value; // check a second time within the lock to avoid race condition and needless locking 18 | Value = await Creator(); 19 | } 20 | return Value; 21 | } 22 | 23 | public async ValueTask DisposeAsync() { 24 | _lock?.Dispose(); 25 | if (Value == null) return; 26 | if (Value is IAsyncDisposable a) await a.DisposeAsync(); 27 | else if (Value is IDisposable d) d.Dispose(); 28 | } 29 | } 30 | 31 | public 
class Defer { 32 | readonly SemaphoreSlim _lock = new SemaphoreSlim(initialCount: 1, maxCount: 1); 33 | public Defer(Func> creator) => Creator = creator; 34 | Func> Creator { get; } 35 | public T Value; 36 | 37 | public async Task GetOrCreate(TParam param) { 38 | if (!Value.NullOrDefault()) 39 | return Value; 40 | using (await _lock.LockAsync()) { 41 | if (!Value.NullOrDefault()) 42 | return Value; // re-check inside the lock: another caller may have created the value while this one was waiting 43 | Value = await Creator(param); 44 | } 45 | return Value; 46 | } 47 | } -------------------------------------------------------------------------------- /App/SysExtensions/ValueExtensions.cs: -------------------------------------------------------------------------------- 1 | namespace SysExtensions; 2 | 3 | public static class ValueExtensions { 4 | /// Run a func on a thing. Does not run when thing is null (returns default instead). An easy way to make something fluent and null safe. 5 | public static TR Dot(this T? thing, Func fun) where T : struct => thing.HasValue ? fun(thing.Value) : default; 6 | 7 | /// Run a func on a thing. Does not run when thing is null (returns default instead). An easy way to make something fluent and null safe. 8 | public static TR Dot(this T thing, Func fun) where T : class => thing is null ? default : fun(thing); 9 | 10 | public static T Clamp(this T v, T min, T max) where T : IComparable => v.CompareTo(max) > 0 ? max : v.CompareTo(min) < 0 ? min : v; 11 | 12 | public static T Min(this T a, T b) where T : IComparable => a.CompareTo(b) <= 0 ? a : b; 13 | public static T Max(this T a, T b) where T : IComparable => a.CompareTo(b) >= 0 ? 
a : b; 14 | } -------------------------------------------------------------------------------- /App/Tests/BlockTest.cs: -------------------------------------------------------------------------------- 1 | using NUnit.Framework; 2 | using static Tests.TestSetup; 3 | 4 | namespace Tests; 5 | 6 | public class BlockTest { 7 | static readonly Random Rand = new(); 8 | 9 | [Test] 10 | public static async Task TestFlatMap() { 11 | var ctx = await TextCtx(); 12 | var log = ctx.Log; 13 | log.Information("TestBatchBlock started"); 14 | 15 | async IAsyncEnumerable AsyncItems(int count, string desc) { 16 | await foreach (var i in Enumerable.Range(start: 0, count).Batch(4).BlockDo(async (b, i) => { 17 | await 1.Seconds().Delay(); 18 | if (i == 3 && desc == "a") { 19 | log.Debug("error thrown"); 20 | throw new("does this stop the thing?"); 21 | } 22 | return b; 23 | })) { 24 | await 1.Seconds().Delay(); 25 | yield return $"{desc} says hello {i.Join(",")}"; 26 | } 27 | } 28 | 29 | //var listA = await AsyncItems(100, "a").ToListAsync(); 30 | var list = new[] { "a", "b" }.Select(s => AsyncItems(count: 20, s)).ToArray().BlockFlatMap(Task.FromResult, parallel: 4); 31 | var res = await list.ToListAsync(); 32 | } 33 | 34 | #pragma warning disable 1998 35 | static async IAsyncEnumerable AsyncRange(int count) { 36 | #pragma warning restore 1998 37 | foreach (var i in Enumerable.Range(start: 0, count)) yield return i; 38 | } 39 | 40 | [Test] 41 | public static async Task TestChainedBlocks() { 42 | using var ctx = await TextCtx(); 43 | var log = ctx.Log; 44 | 45 | var res = await AsyncRange(10).BlockDo(async i => { 46 | await 1.Seconds().Delay(); 47 | log.Information("{i}a", i); 48 | if (i == 2) throw new("a block error"); 49 | return $"{i}a"; 50 | }, parallel: 4).BlockDo(async a => { 51 | await 1.Seconds().Delay(); 52 | log.Information("{a}b", a); 53 | /*if (a == "20a") { 54 | log.Information("b is throws error now"); 55 | throw new("b error"); 56 | }*/ 57 | return $"{a}b"; 58 | }, 
parallel: 4) 59 | .NotNull() 60 | //.Batch(10) 61 | //.Select(g => g.Select(b => $"{b}c").ToArray()) 62 | .BlockDo(async s => { 63 | await 1.Seconds().Delay(); 64 | log.Information("Batch {Res}", s); 65 | }); 66 | } 67 | 68 | class Item { 69 | public string Id { get; set; } 70 | public int Num { get; set; } 71 | } 72 | } -------------------------------------------------------------------------------- /App/Tests/DbTests.cs: -------------------------------------------------------------------------------- 1 | using Autofac; 2 | using NUnit.Framework; 3 | using YtReader.Db; 4 | 5 | namespace Tests; 6 | 7 | public class DbTests { 8 | record VideoResult(string video_id); 9 | 10 | [Test] 11 | public static async Task TestSfQuery() { 12 | using var ctx = await TestSetup.TextCtx(); 13 | var conn = ctx.Scope.Resolve(); 14 | using var db = await conn.Open(ctx.Log); 15 | var res = await db.Query("test", "select video_id from video_latest limit 10"); 16 | } 17 | } -------------------------------------------------------------------------------- /App/Tests/FormattingTests.cs: -------------------------------------------------------------------------------- 1 | using FluentAssertions; 2 | using NUnit.Framework; 3 | 4 | namespace Tests; 5 | 6 | public static class FormattingTests { 7 | [Test] 8 | public static void TestTimestampHumanise() { 9 | 120.Seconds().HumanizeShort().Should().Be("2m 0s"); 10 | 0.Seconds().HumanizeShort().Should().Be("0s"); 11 | 1.6.Seconds().HumanizeShort().Should().Be("1.6s"); 12 | 0.12.Seconds().HumanizeShort().Should().Be("120ms"); 13 | new TimeSpan(days: 1, hours: 2, minutes: 3, seconds: 4).HumanizeShort().Should().Be("1d 2h"); 14 | } 15 | } -------------------------------------------------------------------------------- /App/Tests/GlobalUsings.cs: -------------------------------------------------------------------------------- 1 | global using System; 2 | global using SysExtensions; 3 | global using SysExtensions.Collections; 4 | global using 
SysExtensions.Net; 5 | global using SysExtensions.Serialization; 6 | global using SysExtensions.Text; 7 | global using SysExtensions.Threading; 8 | global using System.Linq; 9 | global using System.Net.Http; 10 | global using Serilog; 11 | global using System.Text.RegularExpressions; 12 | global using Humanizer; 13 | global using System.Threading.Tasks; 14 | global using System.Collections.Generic; 15 | global using System.Threading; -------------------------------------------------------------------------------- /App/Tests/SerializationTests.cs: -------------------------------------------------------------------------------- 1 | using FluentAssertions; 2 | using Mutuo.Etl.Blob; 3 | using Newtonsoft.Json; 4 | using NUnit.Framework; 5 | 6 | namespace Tests; 7 | 8 | public class SerializationTests { 9 | [Test] 10 | public void TestSerializeRecord() { 11 | var file = new FileListItem("my/path", new DateTimeOffset(year: 2020, month: 05, day: 04, hour: 10, minute: 2, second: 0, millisecond: 0, TimeSpan.Zero), 12 | Bytes: 500); 13 | file.ToJson(JsonExtensions.DefaultSettings(Formatting.None)).Should() 14 | .Be("{\"path\":\"my/path\",\"modified\":\"2020-05-04T10:02:00+00:00\",\"bytes\":500}"); 15 | file.ToJson().ToObject().Should().BeEquivalentTo(file); 16 | } 17 | } -------------------------------------------------------------------------------- /App/Tests/TestSetup.cs: -------------------------------------------------------------------------------- 1 | using Autofac; 2 | using Nito.AsyncEx; 3 | using NUnit.Framework; 4 | using YtReader; 5 | 6 | namespace Tests; 7 | 8 | public record TestCtx(ILifetimeScope Scope, ILogger Log, AppCfg App, RootCfg Root) : IDisposable { 9 | public void Dispose() { 10 | if (Log is IDisposable d) d.Dispose(); 11 | Scope?.Dispose(); 12 | } 13 | 14 | public T Resolve() => Scope.Resolve(); 15 | } 16 | 17 | public static class TestSetup { 18 | static readonly AsyncLazy Ctx = new(async () => { 19 | var (cfg, rootCfg, version) = await 
Setup.LoadCfg(basePath: Setup.SolutionDir.Combine("YtCli").FullPath); 20 | var log = Setup.CreateLogger(rootCfg.Env, "Recfluence.Tests", version, cfg); 21 | Log.Logger = log; 22 | var appCtx = Setup.PipeAppCtxEmptyScope(rootCfg, cfg, version.Version); 23 | return new(Setup.MainScope(rootCfg, cfg, appCtx, version, log), log, cfg, rootCfg); 24 | }); 25 | 26 | public static async Task TextCtx() { 27 | var ctx = await Ctx; 28 | ctx.Log.Information("Starting Test - {TestName}", TestContext.CurrentContext.Test.Name); 29 | return ctx; 30 | } 31 | } -------------------------------------------------------------------------------- /App/Tests/Tests.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | net6.0 4 | false 5 | 10 6 | Library 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /App/YtCli/PipeCmd.cs: -------------------------------------------------------------------------------- 1 | using System.Threading.Tasks; 2 | using CliFx; 3 | using CliFx.Attributes; 4 | using CliFx.Exceptions; 5 | using CliFx.Infrastructure; 6 | using Mutuo.Etl.Pipe; 7 | using Serilog; 8 | using SysExtensions.Text; 9 | 10 | namespace YtCli; 11 | 12 | /// Generic command for pipe ETL to launch instances to perform any pipe operations 13 | [Command("pipe", Description = "Used internally to spawn containers running partitions of work")] 14 | public class PipeCmd : PipeCmdArgs, ICommand { 15 | readonly IPipeCtx PipeCtx; 16 | readonly ILogger Log; 17 | 18 | public PipeCmd(IPipeCtx pipeCtx, ILogger log) { 19 | PipeCtx = pipeCtx; 20 | Log = log; 21 | } 22 | 23 | public override async ValueTask ExecuteAsync(IConsole console) { 24 | var pipeMethods = PipeCtx.PipeMethods(); 25 | var runId = RunId.HasValue() ? 
PipeRunId.FromString(RunId) : new(); 26 | if (RunId.NullOrEmpty()) throw new CommandException($"Provide one of the following pipes to run: {pipeMethods.Join(", ", m => m.Method.Name)}"); 27 | if (!pipeMethods.ContainsKey(runId.Name)) 28 | throw new CommandException($"Pipe {runId.Name} not found. Available: {pipeMethods.Join(", ", m => m.Method.Name)}"); 29 | 30 | var cancel = console.RegisterCancellationHandler(); 31 | var log = Log.ForContext("RunId", runId); 32 | log.Information("Pipe Run Command Started {RunId}", RunId); 33 | if (runId.HasGroup) { 34 | await PipeCtx.DoPipeWork(runId, cancel); 35 | } 36 | else { 37 | var res = await PipeCtx.Run(runId.Name, new() { Location = Location ?? PipeRunLocation.Local }, log: log, cancel: cancel); 38 | if (res.Error) 39 | throw new CommandException(res.ErrorMessage); 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /App/YtCli/Program.cs: -------------------------------------------------------------------------------- 1 | using Autofac; 2 | using CliFx; 3 | using SysExtensions.Text; 4 | using YtCli; 5 | using YtReader; 6 | 7 | var (cfg, root, version) = await Setup.LoadCfg(rootLogger: Setup.ConsoleLogger()); 8 | var ytAssembly = typeof(RecExportCmd).Assembly; 9 | using var log = Setup.CreateLogger(root.Env, "Recfluence", version, cfg); 10 | using var scope = Setup.MainScope(root, cfg, Setup.PipeAppCtxEmptyScope(root, cfg, version.Version), version, log, args); 11 | using var cmdScope = scope.BeginLifetimeScope(c => { 12 | c.RegisterAssemblyTypes(typeof(ChannelInfoCmd).Assembly, ytAssembly) 13 | .AssignableTo(); 14 | }); 15 | var app = new CliApplicationBuilder() 16 | .AddCommandsFromThisAssembly() 17 | .AddCommandsFrom(ytAssembly) 18 | .UseTypeActivator(t => cmdScope.Resolve(t)) 19 | .SetTitle("Recfluence") 20 | .SetVersion(version.Version.ToString()) 21 | .Build(); 22 | log.Information("Starting cmd (recfluence {Args}) {Env} {Version}", args.Join(" "), root.Env, 
version.Version); 23 | var res = await app.RunAsync(args); 24 | log.Information("Completed cmd (recfluence {Args}) {Env} {Version}", args.Join(" "), root.Env, version.Version); 25 | return res; -------------------------------------------------------------------------------- /App/YtCli/YtCli.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | Exe 4 | net6.0 5 | 7 | recfluence 8 | 10 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | Always 22 | 23 | 24 | Always 25 | 26 | 27 | Always 28 | 29 | 30 | PreserveNewest 31 | 32 | 33 | 34 | 35 | 36 | Always 37 | 38 | 39 | 40 | Always 41 | 42 | 43 | -------------------------------------------------------------------------------- /App/YtCli/default.appcfg.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/markledwich2/Recfluence/master/App/YtReader/AppCfg.schema.json", 3 | "seq": { 4 | "seqUrl": "http://seq.recfluence.net", 5 | "containerGroupName": "seq" 6 | }, 7 | "snowflake": { 8 | "host": "ql98184.west-us-2.azure.snowflakecomputing.com", 9 | "warehouse": "yt", 10 | "schema": "public", 11 | "db": "yt" 12 | }, 13 | "collect": { 14 | "discoverChannels": 1 15 | }, 16 | "pipe": { 17 | "azure": { 18 | "subscriptionId": "93ef8661-3e11-4fcc-92ae-d0383e7e4ad3", 19 | "servicePrincipal": { 20 | "clientId": "664f7796-ecbf-4244-ba6b-b4d76e1c1037", 21 | "tennantId": "6084d3dc-3b42-4303-8568-314a3db26017" 22 | }, 23 | "resourceGroup": "ytnetworks" 24 | }, 25 | "default": { 26 | "container": { 27 | "registry": "ytnetworks.azurecr.io", 28 | "imageName": "recfluence", 29 | "cores": 2, 30 | "mem": 4, 31 | "exe": "./recfluence" 32 | } 33 | }, 34 | "pipes": [ 35 | { 36 | "pipeName": "Update", 37 | "container": { 38 | "mem": 3, 39 | "cores": 2 40 | } 41 | }, 42 | { 43 | "pipeName": "ProcessChannels", 44 | "container": { 45 | "mem": 2, 46 | "cores": 1 47 | }, 48 | "maxParallel": 22, 49 | 
"minWorkItems": 300 50 | }, 51 | { 52 | "pipeName": "ProcessVideos", 53 | "container": { 54 | "mem": 2, 55 | "cores": 2 56 | }, 57 | "maxParallel": 8, 58 | "minWorkItems": 2000 59 | }, 60 | { 61 | "pipeName": "CollectUserChannels", 62 | "container": { 63 | "cores": 1, 64 | "mem": 1 65 | }, 66 | "maxParallel": 12, 67 | "minWorkItems": 2000 68 | }, 69 | { 70 | "pipeName": "DataScripts", 71 | "container": { 72 | "mem": 4, 73 | "cores": 4 74 | } 75 | } 76 | ] 77 | }, 78 | "dataScripts": { 79 | "stale": "2021-03-26T04:39:34Z" 80 | }, 81 | "aws": { 82 | "region": "us-west-2", 83 | "s3": { 84 | "bucket": "pendulum-lake" 85 | } 86 | } 87 | } -------------------------------------------------------------------------------- /App/YtCli/dev.appcfg.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "backupRootPath": "backuptest" 4 | }, 5 | "collect": { 6 | "discoverChannels": 0 7 | }, 8 | "pipe": { 9 | "location": "Local" 10 | } 11 | } -------------------------------------------------------------------------------- /App/YtCli/prod.appcfg.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "backupRootPath": "backup" 4 | }, 5 | "pipe": { 6 | "location": "Container" 7 | } 8 | } -------------------------------------------------------------------------------- /App/YtFunctions/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-dotnettools.csharp" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /App/YtFunctions/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to .NET Functions", 6 | "type": "coreclr", 7 | "request": "attach", 8 | "processId": 
"${command:azureFunctions.pickProcess}" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /App/YtFunctions/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "Jsonl", 4 | "Mutuo", 5 | "Serilog" 6 | ], 7 | "azureFunctions.deploySubpath": "bin/Release/net6.0/publish", 8 | "azureFunctions.projectLanguage": "C#", 9 | "azureFunctions.projectRuntime": "~4", 10 | "debug.internalConsoleOptions": "neverOpen", 11 | "azureFunctions.preDeployTask": "publish" 12 | } -------------------------------------------------------------------------------- /App/YtFunctions/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "publush", 6 | "command": "dotnet", 7 | "args": [ 8 | "publish", 9 | "-c", 10 | "Release" 11 | ], 12 | "problemMatcher": [ 13 | "$msCompile" 14 | ] 15 | }, 16 | { 17 | "label": "clean", 18 | "command": "dotnet", 19 | "args": [ 20 | "clean", 21 | "/property:GenerateFullPaths=true", 22 | "/consoleloggerparameters:NoSummary" 23 | ], 24 | "type": "process", 25 | "problemMatcher": "$msCompile" 26 | }, 27 | { 28 | "label": "build", 29 | "command": "dotnet", 30 | "args": [ 31 | "build", 32 | "/property:GenerateFullPaths=true", 33 | "/consoleloggerparameters:NoSummary" 34 | ], 35 | "type": "process", 36 | "dependsOn": "clean", 37 | "group": { 38 | "kind": "build", 39 | "isDefault": true 40 | }, 41 | "problemMatcher": "$msCompile" 42 | }, 43 | { 44 | "label": "clean release", 45 | "command": "dotnet", 46 | "args": [ 47 | "clean", 48 | "--configuration", 49 | "Release", 50 | "/property:GenerateFullPaths=true", 51 | "/consoleloggerparameters:NoSummary" 52 | ], 53 | "type": "process", 54 | "problemMatcher": "$msCompile" 55 | }, 56 | { 57 | "label": "publish", 58 | "command": "dotnet", 59 | "args": [ 60 | "publish", 61 | 
"--configuration", 62 | "Release", 63 | "/property:GenerateFullPaths=true", 64 | "/consoleloggerparameters:NoSummary" 65 | ], 66 | "type": "process", 67 | "dependsOn": "clean release", 68 | "problemMatcher": "$msCompile" 69 | }, 70 | { 71 | "type": "func", 72 | "dependsOn": "build", 73 | "options": { 74 | "cwd": "${workspaceFolder}/bin/Debug/net6.0" 75 | }, 76 | "command": "host start --dotnet-isolated-debug", 77 | "isBackground": true, 78 | "problemMatcher": "$func-dotnet-watch" 79 | } 80 | ] 81 | } -------------------------------------------------------------------------------- /App/YtFunctions/ApiBackend.cs: -------------------------------------------------------------------------------- 1 | using System.Threading.Tasks; 2 | using Microsoft.Azure.Functions.Worker; 3 | using Microsoft.Azure.Functions.Worker.Http; 4 | using Mutuo.Etl.AzureManagement; 5 | using Mutuo.Etl.Pipe; 6 | using Semver; 7 | using Serilog; 8 | using SysExtensions.Text; 9 | using YtReader; 10 | using static YtFunctions.HttpResponseEx; 11 | using IMSLogger = Microsoft.Extensions.Logging.ILogger; 12 | 13 | namespace YtFunctions; 14 | 15 | public record ApiBackend(SemVersion Version, IPipeCtx Ctx, ILogger Log, ContainerCfg ContainerCfg, YtContainerRunner Runner, AzureCleaner AzCleaner) { 16 | [Function(nameof(DeleteExpiredResources_Timer))] 17 | public Task DeleteExpiredResources_Timer([TimerTrigger("0 0 * * * *")] TimerInfo myTimer) => 18 | F(() => AzCleaner.DeleteExpiredResources(CleanContainerMode.Standard, Log)); 19 | 20 | [Function("Version")] 21 | public HttpResponseData GetVersion([HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req) => req.TextResponse(Version.ToString()); 22 | 23 | [Function("Update_Timer")] public Task Update_Timer([TimerTrigger("0 0 0 * * SAT")] TimerInfo timer) => F(RunUpdate); 24 | 25 | [Function("Update")] 26 | public Task Update([HttpTrigger(AuthorizationLevel.Function, "get", "post")] HttpRequestData req) => R(async () => { 27 | var 
container = await RunUpdate(); 28 | return req.TextResponse($"Update - started container '{container}'"); 29 | }); 30 | 31 | async Task RunUpdate() { 32 | var groupName = $"update{(Version.Prerelease.HasValue() ? $"-{Version.Prerelease}" : "")}"; 33 | await Runner.Run(groupName, returnOnStart: true, args: new[] { "update" }); 34 | return groupName; 35 | } 36 | } -------------------------------------------------------------------------------- /App/YtFunctions/Create.azcli: -------------------------------------------------------------------------------- 1 | 2 | az login 3 | az configure --defaults location=westus2 group=ytnetworks 4 | az storage account create --name recfluencefunc --sku Standard_LRS 5 | az functionapp create --consumption-plan-location westus2 --runtime dotnet-isolated --runtime-version 6.0 --functions-version 4 --name recfluence --storage-account recfluencefunc 6 | az functionapp cors show --name recfluence 7 | az functionapp cors remove -n recfluence --allowed-origins 8 | az functionapp cors add -n recfluence --allowed-origins * 9 | # once func is created, use azur devops to deploy to it -------------------------------------------------------------------------------- /App/YtFunctions/HttpResponseEx.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Net; 3 | using System.Threading.Tasks; 4 | using Microsoft.Azure.Functions.Worker.Http; 5 | using Newtonsoft.Json; 6 | using Serilog; 7 | using SysExtensions; 8 | using SysExtensions.Serialization; 9 | using static System.Net.HttpStatusCode; 10 | 11 | namespace YtFunctions; 12 | 13 | public static class HttpResponseEx { 14 | public static HttpResponseData JsonResponse(this HttpRequestData req, object data, HttpStatusCode status = OK, 15 | JsonSerializerSettings settings = null) => 16 | req.JsonResponse(data.ToJson(settings), status); 17 | 18 | public static HttpResponseData JsonResponse(this HttpRequestData req, string json, 
HttpStatusCode status = OK) { 19 | var res = req.CreateResponse(status).WithJsonContentHeaders(); 20 | res.WriteString(json); 21 | return res; 22 | } 23 | 24 | public static HttpResponseData TextResponse(this HttpRequestData req, string data, HttpStatusCode status = OK) { 25 | var res = req.CreateResponse(status); 26 | res.Headers.Add("Content-Type", "text/plain; charset=utf-8"); 27 | res.WriteString(data); 28 | return res; 29 | } 30 | 31 | public static HttpResponseData WithJsonContentHeaders(this HttpResponseData res) { 32 | res.Headers.Add("Content-Type", "application/json; charset=utf-8"); // "utf-8" is the registered charset name; same form as TextResponse 33 | return res; 34 | } 35 | 36 | public static async Task F(Func run) => 37 | await run().WithOnError(ex => Log.Error(ex, "Func failed: {Message}", ex.Message)); 38 | 39 | public static async Task R(Func> run) => 40 | await run().WithOnError(ex => Log.Error(ex, "Func failed: {Message}", ex.Message)); 41 | } -------------------------------------------------------------------------------- /App/YtFunctions/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Autofac.Extensions.DependencyInjection; 3 | using Microsoft.Extensions.Configuration; 4 | using Microsoft.Extensions.Hosting; 5 | using Serilog; 6 | using YtReader; 7 | 8 | var cfgDir = Setup.SolutionDir?.Combine("YtCli").FullPath ?? Environment.CurrentDirectory; 9 | var (cfg, root, version) = await Setup.LoadCfg(cfgDir, Setup.ConsoleLogger()); 10 | using var log = Setup.CreateLogger(root.Env, "Recfluence", version, cfg); 11 | 12 | // below is what we need to do to run pipes from functions. But we don't need to right now 13 | // we pre-build the scope so we can provide a PipeAppCtx with a scope that's ready to resolve.
14 | // var appCtx = Setup.PipeAppCtxEmptyScope(root, cfg, version.Version); 15 | // using var mainScope = Setup.MainScope(root, cfg, appCtx, version, log, args); 16 | try { 17 | var host = new HostBuilder() 18 | .UseServiceProviderFactory(new AutofacServiceProviderFactory(c => { 19 | c.ConfigureScope(root, cfg, Setup.PipeAppCtxEmptyScope(root, cfg, version.Version), version, log, args); 20 | })) 21 | .ConfigureAppConfiguration(c => c.AddCommandLine(args)) 22 | .ConfigureFunctionsWorkerDefaults() 23 | .UseSerilog(log) 24 | .Build(); 25 | await host.RunAsync(); 26 | } 27 | catch (Exception ex) { 28 | log.Fatal(ex, "Unhandled error in YtFunction: {Message}", ex.Message); 29 | } -------------------------------------------------------------------------------- /App/YtFunctions/YtFunctions.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | net6.0 4 | v4 5 | Exe 6 | 10 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | PreserveNewest 24 | 25 | 26 | Always 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | default.appcfg.json 36 | PreserveNewest 37 | 38 | 39 | dev.appcfg.json 40 | PreserveNewest 41 | 42 | 43 | prod.appcfg.json 44 | PreserveNewest 45 | 46 | 47 | -------------------------------------------------------------------------------- /App/YtFunctions/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0" 3 | } -------------------------------------------------------------------------------- /App/YtReader/Db/YtMartModel.cs: -------------------------------------------------------------------------------- 1 | namespace YtReader.Db; 2 | /* These are out of date. 
Update before using again */ 3 | 4 | public class DbVideo { 5 | public string VIDEO_ID { get; set; } 6 | public string VIDEO_TITLE { get; set; } 7 | public string CHANNEL_ID { get; set; } 8 | public string CHANNEL_TITLE { get; set; } 9 | public DateTime UPLOAD_DATE { get; set; } 10 | public long VIEWS { get; set; } 11 | public string THUMB_HIGH { get; set; } 12 | public long LIKES { get; set; } 13 | public long DISLIKES { get; set; } 14 | public DateTime DURATION { get; set; } 15 | public string DESCRIPTION { get; set; } 16 | public double PCT_ADS { get; set; } 17 | public DateTime UPDATED { get; set; } 18 | } 19 | 20 | public class DbCaption { 21 | public string CAPTION_ID { get; set; } 22 | public string VIDEO_ID { get; set; } 23 | public string CHANNEL_ID { get; set; } 24 | public string CAPTION { get; set; } 25 | public long OFFSET_SECONDS { get; set; } 26 | public DateTime UPDATED { get; set; } 27 | } 28 | 29 | public class DbChannel { 30 | public string CHANNEL_ID { get; set; } 31 | public string CHANNEL_TITLE { get; set; } 32 | public string MAIN_CHANNEL_ID { get; set; } 33 | public string CHANNEL_DECRIPTION { get; set; } 34 | public string LOGO_URL { get; set; } 35 | public string LR { get; set; } 36 | public long SUBS { get; set; } 37 | public long CHANNEL_VIEWS { get; set; } 38 | public string COUNTRY { get; set; } 39 | public string TAGS { get; set; } 40 | public DateTime UPDATED { get; set; } 41 | public long CHANNEL_VIDEO_VIEWS { get; set; } 42 | public DateTime FROM_DATE { get; set; } 43 | public DateTime TO_DATE { get; set; } 44 | public decimal CHANNEL_LIFETIME_DAILY_VIEWS { get; set; } 45 | public double CHANNEL_LIFETIME_DAILY_VIEWS_RELEVANT { get; set; } 46 | public string MAIN_CHANNEL_TITLE { get; set; } 47 | public string IDEOLOGY { get; set; } 48 | public string MEDIA { get; set; } 49 | public string MANOEL { get; set; } 50 | public string AIN { get; set; } 51 | } -------------------------------------------------------------------------------- 
/App/YtReader/GlobalUsings.cs: -------------------------------------------------------------------------------- 1 | global using System; 2 | global using SysExtensions; 3 | global using SysExtensions.Collections; 4 | global using SysExtensions.Net; 5 | global using SysExtensions.Serialization; 6 | global using SysExtensions.Text; 7 | global using SysExtensions.Threading; 8 | global using System.Linq; 9 | global using System.Net.Http; 10 | global using Serilog; 11 | global using System.Text.RegularExpressions; 12 | global using Humanizer; 13 | global using System.Threading.Tasks; 14 | global using System.Collections.Generic; 15 | global using System.Threading; -------------------------------------------------------------------------------- /App/YtReader/ResourceCycle.cs: -------------------------------------------------------------------------------- 1 | using Nito.AsyncEx; 2 | 3 | namespace YtReader; 4 | 5 | class ResourceCycle : IAsyncDisposable 6 | where TCfg : class 7 | where T : class { 8 | readonly Func> Create; 9 | readonly TCfg[] _configs; 10 | (T Resource, TCfg Cfg)? 
_current; 11 | readonly SemaphoreSlim _lock = new SemaphoreSlim(initialCount: 1, maxCount: 1); 12 | 13 | public ResourceCycle(TCfg[] cfg, Func> create, int index = 0) { 14 | Create = create; 15 | _configs = cfg; 16 | Idx = index; 17 | } 18 | 19 | public int Idx { get; private set; } 20 | 21 | public async Task<(T Resource, TCfg Cfg)> Get() { 22 | var c = _current; 23 | if (c != null) return c.Value; 24 | return await NextResource(null).ConfigureAwait(false); 25 | } 26 | 27 | /// Will cycle to the next resource if the current one matches the resource given (reference equality) 28 | /// 29 | /// 30 | public async Task<(T Resource, TCfg Cfg)> NextResource(T Resource) { 31 | using var l = await _lock.LockAsync(); 32 | 33 | if (_current.HasValue && _current.Value.Resource == Resource) { 34 | Idx = (Idx + 1) % _configs.Length; 35 | if (_current.Value.Resource is IAsyncDisposable d) 36 | await d.DisposeAsync().ConfigureAwait(false); 37 | else if (_current.Value.Resource is IDisposable sd) // sync-only disposables must be released when cycling too, as DisposeAsync already does 38 | sd.Dispose(); 39 | _current = null; 40 | } 41 | 42 | if (_current != null) return _current.Value; 43 | 44 | var cfg = _configs[Idx]; 45 | _current = (await Create(cfg).ConfigureAwait(false), cfg); 46 | return _current.Value; 47 | } 48 | 49 | public async ValueTask DisposeAsync() { 50 | var r = _current?.Resource; 51 | if (r == null) return; 52 | if (r is IAsyncDisposable a) 53 | await a.DisposeAsync().ConfigureAwait(false); 54 | else if (_current?.Resource is IDisposable d) 55 | d.Dispose(); 56 | } 57 | } -------------------------------------------------------------------------------- /App/YtReader/RootCfg.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "appStoreCs": { 5 | "type": "string" 6 | }, 7 | "env": { 8 | "type": "string" 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /App/YtReader/Store/readme.md: -------------------------------------------------------------------------------- 1 | ## How data is stored
in blob storage ## 2 | 3 | Example un-partitioned land & stage store 4 | 5 | ``` 6 | search 7 | landing 8 | 2020-01-01_14_20_10.jsonl.gz <- small files 9 | 2020-01-01_14_20_13.jsonl.gz 10 | stage <- (up to 50MB) files 11 | 2020-01-01_14_20_13.jsonl.gz 12 | ``` 13 | 14 | Example partitioned land & stage store 15 | 16 | ``` 17 | captions 18 | landing 19 | ch1 20 | 2020-01-01_14_20_10.jsonl.gz <- small files 21 | 2020-01-01_14_20_13.jsonl.gz 22 | ch2 23 | 2020-01-01_14_20_10.jsonl.gz 24 | 2020-01-01_14_20_13.jsonl.gz 25 | 26 | stage <- (up to 100MB) files 27 | ch1 28 | 2020-01-01_14_20_16.jsonl.gz <- grouped files within partition 29 | ch2 30 | 2020-01-01_14_20_16.jsonl.gz 31 | ``` 32 | 33 | Example partitioned store 34 | 35 | ``` 36 | ch1 37 | 2020-01-01_14_20_10.jsonl.gz 38 | 2020-01-01_14_20_13.jsonl.gz 39 | ch2 40 | 2020-01-01_14_20_10.jsonl.gz 41 | 2020-01-01_14_20_13.jsonl.gz 42 | ``` 43 | 44 | 45 | -------------------------------------------------------------------------------- /App/YtReader/VersionInfo.cs: -------------------------------------------------------------------------------- 1 | using Semver; 2 | using SysExtensions.Build; 3 | 4 | namespace YtReader; 5 | 6 | public class VersionInfo { 7 | public VersionInfo(SemVersion version, GitVersionInfo info) { 8 | Version = version; 9 | Info = info; 10 | } 11 | 12 | public SemVersion Version { get; set; } 13 | public GitVersionInfo Info { get; set; } 14 | public SemVersion ProdVersion => Version.Change(prerelease: ""); 15 | } 16 | 17 | public class VersionInfoProvider { 18 | readonly Defer LazyVersion; 19 | 20 | public VersionInfoProvider(ILogger log, RootCfg rootCfg) => 21 | LazyVersion = new(async () => { 22 | var (version, info) = await GitVersionInfo.DiscoverVersion(typeof(VersionInfo), log); 23 | var prefix = GetVersionPrefix(rootCfg, version, info); 24 | version = version.Change(prerelease: prefix); 25 | return new(version, info); 26 | }); 27 | 28 | static readonly Regex NonAlphaNum =
new("[^a-zA-Z0-9]", RegexOptions.Compiled); 29 | 30 | public static string GetVersionPrefix(RootCfg rootCfg, SemVersion version, GitVersionInfo info = null) { 31 | if (rootCfg.IsProd()) return rootCfg.BranchEnv ?? ""; 32 | var prerelease = version.Prerelease.HasValue() ? version.Prerelease : null; 33 | var prefix = rootCfg.BranchEnv ?? info?.BranchName ?? prerelease ?? rootCfg.Env.ToLower(); 34 | prefix = NonAlphaNum.Replace(prefix, ""); 35 | return prefix; 36 | } 37 | 38 | public Task Version() => LazyVersion.GetOrCreate(); 39 | } -------------------------------------------------------------------------------- /App/YtReader/Web/AngleExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Net; 2 | using AngleSharp; 3 | using AngleSharp.Dom; 4 | 5 | namespace YtReader.Web; 6 | 7 | public static class AngleExtensions { 8 | public static T El(this IParentNode b, string selector) where T : class, IElement => b.QuerySelector(selector) as T; 9 | public static IElement El(this IParentNode b, string selector) => b.QuerySelector(selector); 10 | 11 | public static IEnumerable Els(this IParentNode b, string selector) where T : class, IElement => b.QuerySelectorAll(selector).Cast(); 12 | public static IEnumerable Els(this IParentNode b, string selector) => b.QuerySelectorAll(selector); 13 | 14 | public static string QsAttr(this IParentNode b, string selector, string attribute) => b.QuerySelector(selector)?.GetAttribute(attribute); 15 | public static void EnsureSuccess(this IDocument doc) => doc.StatusCode.EnsureSuccess(doc.Url); 16 | 17 | public static IConfiguration WithProxyRequester(this IConfiguration angleCfg, FlurlProxyClient proxyClient 18 | , ProxyType proxyType = default) { 19 | var proxy = proxyClient.UseProxy ? 
proxyClient.Cfg.Proxy(proxyType)?.CreateWebProxy() : null; 20 | var handler = new HttpClientHandler { 21 | Proxy = proxy, 22 | PreAuthenticate = true, 23 | UseDefaultCredentials = false, 24 | UseCookies = true, 25 | UseProxy = true, 26 | AutomaticDecompression = DecompressionMethods.All 27 | }; 28 | 29 | /*var requester = new DefaultHttpRequester("PostmanRuntime/7.26.10", request => { 30 | var proxy = proxyClient.UseProxy ? proxyClient.Cfg.Proxy(proxyType)?.CreateWebProxy() : null; 31 | if (proxy != null) request.Proxy = proxy; 32 | log?.Debug("Angle Request {Proxy}: {Curl}", proxy?.Address?.ToString() ?? "(direct)", request.FormatCurl()); 33 | });*/ 34 | /*if (headers != null) 35 | requester.Headers.AddRange(headers); 36 | if (timeout != default) 37 | requester.Timeout = timeout;*/ 38 | return angleCfg.WithRequesters(handler); 39 | } 40 | 41 | /// Configures the angle requester from the given flurl proxy client configuration (doesn't actually use flurl 42 | /// client at run time) 43 | public static IBrowsingContext Browser(this IConfiguration angleCfg) => BrowsingContext.New(angleCfg); 44 | } -------------------------------------------------------------------------------- /App/YtReader/Web/WebEx.cs: -------------------------------------------------------------------------------- 1 | using System.Net; 2 | using Flurl; 3 | using Mutuo.Etl.Blob; 4 | 5 | namespace YtReader.Web; 6 | 7 | public static class WebEx { 8 | public static string LastInPath(this string path) => path?.Split('/').LastOrDefault(t => !t.NullOrEmpty()); 9 | 10 | /// Removes leading and trailing slashes from the path 11 | public static string TrimPath(this string path) => path?.Split('/').Where(t => !t.Trim().NullOrEmpty()).Join("/"); 12 | 13 | public static ProxyConnectionCfg Proxy(this ProxyCfg cfg, ProxyType type) => cfg.Proxies.FirstOrDefault(c => c.Type == type); 14 | 15 | public static WebProxy CreateWebProxy(this ProxyConnectionCfg proxy) => 16 | new(proxy.Url, BypassOnLocal: true, new 
string[] { }, 17 | proxy.Creds != null ? new NetworkCredential(proxy.Creds.Name, proxy.Creds.Secret) : null); 18 | 19 | public static HttpClient CreateHttpClient(this ProxyConnectionCfg proxy, TimeSpan? timeout = null) => 20 | new(new HttpClientHandler { 21 | AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate, 22 | UseCookies = false, 23 | Proxy = proxy.Url == null ? null : proxy.CreateWebProxy(), 24 | UseProxy = proxy.Url != null 25 | }) { 26 | Timeout = timeout ?? 30.Seconds() 27 | }; 28 | 29 | public static async Task LogParseError(this ISimpleFileStore logStore, string msg, Exception ex, Url url, string content, string fileType, ILogger log) { 30 | if (content == null) { 31 | log.Debug(ex, "Parsing Error {Url} - No content: {Msg}", url, msg); 32 | return; 33 | } 34 | var path = new SPath($"{DateTime.UtcNow:yyyy-MM-dd}/{url.Host}/{url.Path.TrimPath()}.{fileType}"); 35 | await logStore.Save(path, content.AsStream(), log); 36 | log.Debug(ex, "Parsing Error {Url} - Saved content to {LogUrl}: {Msg}", url, logStore.Url(path), msg); 37 | } 38 | } -------------------------------------------------------------------------------- /App/YtReader/Web/YtGtEx.cs: -------------------------------------------------------------------------------- 1 | using Flurl; 2 | using LtGt; 3 | 4 | namespace YtReader.Web; 5 | 6 | public static class YtGtEx { 7 | public static HtmlElement El(this HtmlContainer e, string selector) => e.QueryElements(selector).FirstOrDefault(); 8 | public static IEnumerable Els(this HtmlContainer e, string selector) => e.QueryElements(selector); 9 | public static string Str(this HtmlElement e, string attributeName) => e.GetAttributeValue(attributeName); 10 | public static string Str(this QueryParamCollection paramCol, string name) => paramCol.GetAll(name).FirstOrDefault()?.ToString(); 11 | public static string Txt(this HtmlElement e) => e.GetInnerText(); 12 | } 
-------------------------------------------------------------------------------- /App/YtReader/Yt/YtModel.cs: -------------------------------------------------------------------------------- 1 | // all of this is only minor modifications to https://github.com/Tyrrrz/YoutubeExplode 2 | 3 | namespace YtReader.Yt; 4 | 5 | public record YtVideoItem { 6 | public string Id { get; init; } 7 | public DateTime? UploadDate { get; init; } 8 | public string Title { get; init; } 9 | public TimeSpan Duration { get; init; } 10 | public Statistics Statistics { get; init; } 11 | } 12 | 13 | /// User activity statistics. 14 | public record Statistics(ulong? ViewCount, ulong? LikeCount = null, ulong? DislikeCount = null) { 15 | public ulong? Rumbles { get; init; } 16 | public double? AverageRating { get; init; } 17 | public override string ToString() => $"{ViewCount} Views"; 18 | } 19 | 20 | /// Text that gets displayed at specific time during video playback, as part of a CaptionTrack. 21 | public record CaptionLine(string Text, TimeSpan? Offset, TimeSpan? Duration, string Speaker = null) { 22 | public override string ToString() => 23 | $"{new[] { Offset.Dot(o => o.HumanizeShort()), Speaker }.NotNull().ToArray().Dot(d => d.Any() ? $"{d.Join(" ")}: " : "")}{Text}"; 24 | } 25 | 26 | /// Set of captions that get displayed during video playback. 27 | public record CaptionTrack(CaptionTrackInfo Info, IReadOnlyList Captions); 28 | 29 | /// Metadata associated with a certain CaptionTrack. 30 | public record CaptionTrackInfo(string Url, CaptionLang Language, bool IsAutoGenerated) { 31 | public bool Default { get; init; } 32 | public override string ToString() => $"{Language}"; 33 | } 34 | 35 | /// Language information.
36 | public record CaptionLang(string Code, string Name) { 37 | public override string ToString() => $"{Code} ({Name})"; 38 | } -------------------------------------------------------------------------------- /App/YtReader/YtBackup.cs: -------------------------------------------------------------------------------- 1 | using System.Diagnostics; 2 | using Microsoft.Azure.Storage.Blob; 3 | using Microsoft.Azure.Storage.DataMovement; 4 | using Mutuo.Etl.Blob; 5 | using YtReader.Store; 6 | 7 | namespace YtReader; 8 | 9 | public class YtBackup { 10 | readonly BlobStores Stores; 11 | 12 | public YtBackup(BlobStores stores) => Stores = stores; 13 | 14 | public static async Task CopyBlobs(string opName, CloudBlobDirectory sourceBlob, CloudBlobDirectory destBlob, ILogger log) { 15 | var destUrl = destBlob.Uri; 16 | var sw = Stopwatch.StartNew(); 17 | var logInterval = 5.Seconds(); 18 | var context = new DirectoryTransferContext { 19 | ProgressHandler = new Progress(p => { 20 | if (sw.Elapsed < logInterval) return; 21 | sw.Restart(); 22 | log.Debug("{OpName} {Url} - {Size} copied: {Files} files {Skipped} skipped {Failed} failed", 23 | opName, destUrl, p.BytesTransferred.Bytes().Humanize("#,#.#"), p.NumberOfFilesTransferred, p.NumberOfFilesSkipped, p.NumberOfFilesFailed); 24 | }) 25 | }; 26 | 27 | var (res, dur) = await TransferManager.CopyDirectoryAsync(sourceBlob, destBlob, 28 | CopyMethod.ServiceSideSyncCopy, new() { Recursive = true }, 29 | context, CancellationToken.None).WithDuration(); 30 | 31 | if (res.NumberOfFilesFailed > 0) 32 | log.Error("{OpName} {Url} - {Files} files failed to copy", 33 | opName, destUrl, res.NumberOfFilesFailed); 34 | log.Information("{OpName} {Url} - {Size} of {Files} files copied in {Duration}", 35 | opName, destUrl, res.BytesTransferred.Bytes().Humanize("#,#.#"), res.NumberOfFilesTransferred, dur.HumanizeShort()); 36 | } 37 | } -------------------------------------------------------------------------------- 
/App/YtReader/YtConvertWatchTimeFiles.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | using System.IO; 3 | using System.Text; 4 | using CsvHelper; 5 | using Mutuo.Etl.Blob; 6 | using YtReader.Store; 7 | 8 | namespace YtReader; 9 | 10 | public class YtConvertWatchTimeFiles { 11 | readonly ISimpleFileStore Store; 12 | 13 | public YtConvertWatchTimeFiles(BlobStores stores) => Store = stores.Store(DataStoreType.Root); 14 | 15 | public async Task Convert(ILogger log) { 16 | var files = (await Store.List("import/watch_time").SelectManyList()).Where(f => f.Path.ExtensionsString == "csv"); 17 | await files.BlockDo(async f => { 18 | using var stream = await Store.Load(f.Path); 19 | using var sr = new StreamReader(stream); 20 | using var csv = new CsvReader(sr, new(CultureInfo.InvariantCulture) { 21 | Encoding = Encoding.UTF8, 22 | HasHeaderRecord = true, 23 | MissingFieldFound = null, 24 | BadDataFound = r => log.Warning("Error reading csv data at: {RowData}", r.RawRecord) 25 | }); 26 | var rows = await csv.GetRecordsAsync().ToListAsync(); 27 | await Store.Save(f.Path.Parent.Add($"{f.Path.NameSansExtension}.json.gz"), await rows.ToJsonlGzStream(), log); 28 | }, parallel: 4); 29 | } 30 | } -------------------------------------------------------------------------------- /App/YtReader/YtDataform.cs: -------------------------------------------------------------------------------- 1 | using System.ComponentModel.DataAnnotations; 2 | using Mutuo.Etl.Pipe; 3 | using YtReader.Data; 4 | using YtReader.Db; 5 | 6 | namespace YtReader; 7 | 8 | public class DataformCfg { 9 | [Required] 10 | public ContainerCfg Container { get; set; } = new() { 11 | Cores = 1, 12 | Mem = 2, 13 | ImageName = "dataform", 14 | Exe = "" 15 | }; 16 | } 17 | 18 | public class YtDataform { 19 | readonly ContainerLauncher Containers; 20 | readonly DataformCfg Cfg; 21 | readonly SnowflakeCfg SfCfg; 22 | readonly SeqCfg SeqCfg; 23 | 24 | 
public YtDataform(ContainerLauncher containers, DataformCfg cfg, SnowflakeCfg sfCfg, SeqCfg seqCfg) { 25 | Containers = containers; 26 | Cfg = cfg; 27 | SfCfg = sfCfg; 28 | SeqCfg = seqCfg; 29 | } 30 | 31 | public async Task Update(ILogger log, bool fullLoad = false, string[] tables = null, string[] actions = null, bool includeDeps = false, 32 | CancellationToken cancel = default) { 33 | var sfCfg = SfCfg.JsonClone(); 34 | sfCfg.Db = sfCfg.DbName(); // serialize the environment specific db name 35 | 36 | var args = new[] { 37 | fullLoad ? " --full-refresh " : null, 38 | includeDeps ? "--include-deps" : null, 39 | tables?.Any() == true 40 | ? $"{tables.Join(" ", t => $"--actions {t.ToUpperInvariant()}")}" 41 | : (actions ?? new[] { "standard" }).Join(" ", a => $"--tags {a}") 42 | }.NotNull().ToArray(); 43 | 44 | var env = new (string name, string value)[] { 45 | ("SNOWFLAKE_JSON", sfCfg.ToJson()), 46 | ("REPO", "https://github.com/markledwich2/YouTubeNetworks_Dataform.git"), 47 | ("BRANCH", "master"), 48 | ("DATAFORM_RUN_ARGS", args.Join(" ")), 49 | ("SEQ", SeqCfg.SeqUrl.ToString()) 50 | }; 51 | 52 | log.Information("Dataform - launching container to update {Db}. 
dataform {Args}", sfCfg.Db, args); 53 | const string containerName = "dataform"; 54 | var fullName = Cfg.Container.FullContainerImageName("latest"); 55 | var dur = await Containers.RunContainer(containerName, fullName, env, log: log, cancel: cancel).WithDuration(); 56 | log.Information("Dataform - container completed in {Duration}", dur.HumanizeShort()); 57 | } 58 | } -------------------------------------------------------------------------------- /App/docker-run-recfluence.ps1: -------------------------------------------------------------------------------- 1 | $rootCfg = Get-content ./YtCli/local.rootcfg.json | ConvertFrom-Json 2 | $cs = $rootCfg.appStoreCs 3 | docker run -e env=dev -e appStoreCs="$cs" ytnetworks.azurecr.io/recfluence:latest ./recfluence test-chrome-scraper -v fJoAPMWk4zc -------------------------------------------------------------------------------- /App/omnisharp.json: -------------------------------------------------------------------------------- 1 | { 2 | "FormattingOptions": { 3 | "NewLine": "\n", 4 | "UseTabs": false, 5 | "TabSize": 4, 6 | "IndentationSize": 4, 7 | "SpacingAfterMethodDeclarationName": false, 8 | "SpaceWithinMethodDeclarationParenthesis": false, 9 | "SpaceBetweenEmptyMethodDeclarationParentheses": false, 10 | "SpaceAfterMethodCallName": false, 11 | "SpaceWithinMethodCallParentheses": false, 12 | "SpaceBetweenEmptyMethodCallParentheses": false, 13 | "SpaceAfterControlFlowStatementKeyword": true, 14 | "SpaceWithinExpressionParentheses": false, 15 | "SpaceWithinCastParentheses": false, 16 | "SpaceWithinOtherParentheses": false, 17 | "SpaceAfterCast": false, 18 | "SpacesIgnoreAroundVariableDeclaration": false, 19 | "SpaceBeforeOpenSquareBracket": false, 20 | "SpaceBetweenEmptySquareBrackets": false, 21 | "SpaceWithinSquareBrackets": false, 22 | "SpaceAfterColonInBaseTypeDeclaration": true, 23 | "SpaceAfterComma": true, 24 | "SpaceAfterDot": false, 25 | "SpaceAfterSemicolonsInForStatement": true, 26 | 
"SpaceBeforeColonInBaseTypeDeclaration": true, 27 | "SpaceBeforeComma": false, 28 | "SpaceBeforeDot": false, 29 | "SpaceBeforeSemicolonsInForStatement": false, 30 | "SpacingAroundBinaryOperator": "single", 31 | "IndentBraces": false, 32 | "IndentBlock": true, 33 | "IndentSwitchSection": true, 34 | "IndentSwitchCaseSection": true, 35 | "LabelPositioning": "oneLess", 36 | "WrappingPreserveSingleLine": true, 37 | "WrappingKeepStatementsOnSingleLine": true, 38 | "NewLinesForBracesInTypes": false, 39 | "NewLinesForBracesInMethods": false, 40 | "NewLinesForBracesInProperties": false, 41 | "NewLinesForBracesInAccessors": false, 42 | "NewLinesForBracesInAnonymousMethods": false, 43 | "NewLinesForBracesInControlBlocks": false, 44 | "NewLinesForBracesInAnonymousTypes": false, 45 | "NewLinesForBracesInObjectCollectionArrayInitializers": false, 46 | "NewLinesForBracesInLambdaExpressionBody": false, 47 | "NewLineForElse": false, 48 | "NewLineForCatch": false, 49 | "NewLineForFinally": false, 50 | "NewLineForMembersInObjectInit": false, 51 | "NewLineForMembersInAnonymousTypes": false, 52 | "NewLineForClausesInQuery": false 53 | } 54 | } -------------------------------------------------------------------------------- /DataScripts/.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # Update the VARIANT arg in devcontainer.json to pick a Python version: 3, 3.8, 3.7, 3.6 2 | # To fully customize the contents of this image, use the following Dockerfile instead: 3 | # https://github.com/microsoft/vscode-dev-containers/tree/v0.112.0/containers/python-3/.devcontainer/base.Dockerfile 4 | ARG VARIANT="3.9" 5 | FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} 6 | 7 | RUN pip install --upgrade pip 8 | -------------------------------------------------------------------------------- /DataScripts/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For 
format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.112.0/containers/python-3 3 | { 4 | "name": "Python 3", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | // Update 'VARIANT' to pick a Python version. Rebuild the container 9 | // if it already exists to update. Available variants: 3, 3.6, 3.7, 3.8 10 | "args": { 11 | "VARIANT": "3" 12 | } 13 | }, 14 | // Set *default* container specific settings.json values on container create. 15 | "settings": { 16 | "terminal.integrated.shell.linux": "/bin/bash", 17 | "python.pythonPath": "/usr/local/bin/python", 18 | "python.linting.enabled": true, 19 | "python.linting.pylintEnabled": true, 20 | "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", 21 | "python.formatting.blackPath": "/usr/local/py-utils/bin/black", 22 | "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", 23 | "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", 24 | "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", 25 | "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", 26 | "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", 27 | "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", 28 | "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint", 29 | "python.testing.pytestPath": "/usr/local/py-utils/bin/pytest" 30 | }, 31 | // Add the IDs of extensions you want installed when the container is created. 32 | "extensions": [ 33 | "ms-python.python", 34 | "Azurite.azurite", 35 | "ms-azuretools.vscode-azurestorage", 36 | "formulahendry.code-runner", 37 | "njpwerner.autodocstring", 38 | "ms-python.vscode-pylance" 39 | ], 40 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 
41 | // "forwardPorts": [], 42 | "postCreateCommand": "pip install --user -r requirements.txt && python -m spacy download en_core_web_sm", 43 | } -------------------------------------------------------------------------------- /DataScripts/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | __azurite_db_blob*.json 3 | __*__ -------------------------------------------------------------------------------- /DataScripts/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations. 3 | // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp 4 | // List of extensions which should be recommended for users of this workspace. 5 | "recommendations": [ 6 | "Azurite.azurite", 7 | "ms-azuretools.vscode-azurestorage", 8 | "ms-python.python", 9 | "njpwerner.autodocstring", 10 | "adrieankhisbe.vscode-ndjson" 11 | ], 12 | // List of extensions recommended by VS Code that should not be recommended for users of this workspace. 
13 | "unwantedRecommendations": [] 14 | } -------------------------------------------------------------------------------- /DataScripts/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug", 6 | "type": "python", 7 | "request": "launch", 8 | "program": "app.py", 9 | "console": "integratedTerminal", 10 | "args": [ 11 | "--videos", 12 | "So1i3LA6IIU,L__k9QzfyE" 13 | ] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /DataScripts/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "Identitarian", 4 | "aiohttp", 5 | "appcfg", 6 | "asdict", 7 | "asyncio", 8 | "creds", 9 | "dotenv", 10 | "ents", 11 | "getenv", 12 | "groupby", 13 | "islower", 14 | "iterrows", 15 | "itertools", 16 | "jdbc", 17 | "jsonl", 18 | "jsons", 19 | "lemmatizer", 20 | "nargs", 21 | "sesh", 22 | "spacy", 23 | "spawnl", 24 | "strftime", 25 | "userscrape", 26 | "vids", 27 | "visibles" 28 | ], 29 | "python.formatting.provider": "autopep8", 30 | "python.linting.enabled": true, 31 | "python.formatting.autopep8Args": [ 32 | "--max-line-length", 33 | "160" 34 | ], 35 | "[python]": { 36 | "editor.insertSpaces": true, 37 | "editor.tabSize": 4 38 | }, 39 | "[json]": { 40 | "editor.insertSpaces": true, 41 | "editor.tabSize": 2 42 | }, 43 | "files.exclude": { 44 | "**/.git": true, 45 | "**/.svn": true, 46 | "**/.hg": true, 47 | "**/CVS": true, 48 | "**/.DS_Store": true, 49 | "**/__*__": true, 50 | "__azurite_*.json": true, 51 | }, 52 | "editor.formatOnSave": true, 53 | "python.languageServer": "Pylance", 54 | "python.analysis.typeCheckingMode": "basic", 55 | "python.testing.unittestArgs": [ 56 | "-v", 57 | "-s", 58 | ".", 59 | "-p", 60 | "*_test.py" 61 | ], 62 | "python.testing.pytestEnabled": false, 63 | "python.testing.nosetestsEnabled": false, 64 | 
"python.testing.unittestEnabled": true 65 | } -------------------------------------------------------------------------------- /DataScripts/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | { 4 | "type": "docker-build", 5 | "label": "docker-build", 6 | "platform": "python", 7 | "dockerBuild": { 8 | "tag": "ytnetworks.azurecr.io/datascripts:latest", 9 | "dockerfile": "${workspaceFolder}/Dockerfile", 10 | "context": "${workspaceFolder}" 11 | }, 12 | "group": { 13 | "kind": "build", 14 | "isDefault": true 15 | } 16 | }, 17 | { 18 | "label": "run docker -it", 19 | "type": "shell", 20 | "command": "docker", 21 | "args": [ 22 | "run", 23 | "-it", 24 | "--env-file", 25 | ".env", 26 | "ytnetworks.azurecr.io/datascripts:latest" 27 | ], 28 | "problemMatcher": [] 29 | } 30 | ], 31 | } -------------------------------------------------------------------------------- /DataScripts/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | RUN pip install --upgrade pip 4 | 5 | # Keeps Python from generating .pyc files in the container 6 | ENV PYTHONDONTWRITEBYTECODE 1 7 | 8 | # Turns off buffering for easier container logging 9 | ENV PYTHONUNBUFFERED 1 10 | 11 | # Install pip requirements 12 | ADD requirements.txt . 13 | RUN pip install -r requirements.txt 14 | 15 | WORKDIR /app 16 | ADD . /app 17 | 18 | RUN python -m spacy download en_core_web_sm 19 | 20 | # During debugging, this entry point will be overridden. 
For more information, refer to https://aka.ms/vscode-docker-python-debug 21 | CMD ["python", "app.py"] -------------------------------------------------------------------------------- /DataScripts/app.py: -------------------------------------------------------------------------------- 1 | 2 | from video_entities import video_entities 3 | from args import Args, load_args 4 | from log import configure_log 5 | from cfg import load_cfg 6 | import asyncio 7 | 8 | 9 | async def run(args: Args): 10 | '''loads video named entities from a list of video id's in a jsonl.gz file''' 11 | cfg = await load_cfg() 12 | log = configure_log(cfg) 13 | if(cfg.state.videoPaths is None and args.videos is None): 14 | raise Exception('Need either videoPaths or videos') 15 | 16 | log.info('video_entities - {machine} started: {state}', machine=cfg.machine, state=cfg.state.to_json()) 17 | 18 | try: 19 | video_entities(cfg, args, log) 20 | except Exception as e: 21 | log.error("error running video_entities", exc_info=True) 22 | 23 | if __name__ == "__main__": 24 | asyncio.run(run(load_args())) 25 | -------------------------------------------------------------------------------- /DataScripts/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class Args: 8 | """ A list""" 9 | videos: List[str] 10 | 11 | 12 | def load_args(): 13 | parser = argparse.ArgumentParser(description='Run python scripts against the recfluence/ttube warehouse') 14 | parser.add_argument("--videos", "-v", 15 | help="A comma separeted list fo videos to perform named entity recognition on. e.g. So1i3LA6IIU,CL__k9QzfyE. 
If not specified, videos batch files need to be in the run_state environment variable (the way other application provide a list of video_id's).", 16 | default=None) 17 | args = parser.parse_args() 18 | 19 | return Args(list(args.videos.split(',')) if args.videos else None) 20 | -------------------------------------------------------------------------------- /DataScripts/blobstore.py: -------------------------------------------------------------------------------- 1 | from cfg import StoreCfg 2 | from azure.storage.blob._models import ContentSettings 3 | from azure.storage.blob import ContainerClient, PublicAccess 4 | from azure.core.exceptions import ResourceNotFoundError 5 | from pathlib import PurePath 6 | 7 | 8 | class BlobStore: 9 | def __init__(self, cfg: StoreCfg): 10 | self.cfg = cfg 11 | self.container = ContainerClient.from_connection_string(cfg.dataStorageCs, cfg.container) 12 | 13 | def ensure_container_exits(self, public_access: PublicAccess = None): 14 | """creates the container if it doesn't exist""" 15 | try: 16 | props = self.container.get_container_properties() 17 | except ResourceNotFoundError: 18 | self.container.create_container(public_access=public_access) 19 | except BaseException as e: 20 | raise e 21 | 22 | def save_file(self, localFile: PurePath, remotePath: PurePath, content_type: str = None): 23 | """uploads a local file to the container""" 24 | with open(localFile, 'rb') as f: 25 | blob = self.container.get_blob_client(remotePath.as_posix()) 26 | blob.upload_blob(f, 27 | overwrite=True, 28 | content_settings=ContentSettings(content_type=content_type) if content_type else None) 29 | 30 | def delete(self, path: PurePath): 31 | self.container.delete_blob(path.as_posix()) 32 | -------------------------------------------------------------------------------- /DataScripts/jsonl.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import io 3 | import json 4 | from typing import Any 5 | 
from datetime import datetime, timezone 6 | 7 | 8 | class JsonlEncoder(json.JSONEncoder): 9 | def default(self, o): 10 | if dataclasses.is_dataclass(o): 11 | return dataclasses.asdict(o) 12 | if isinstance(o, datetime): 13 | # create a standard json formatted datetime 14 | return o.strftime('%Y-%m-%dT%H:%M:%S.%fZ') if o.tzinfo is None or o.tzinfo == timezone.utc else o.isoformat() 15 | return super().default(o) 16 | 17 | def encode(self, obj, *args, **kwargs): 18 | lines = [] 19 | for each in obj: 20 | line = super(JsonlEncoder, self).encode(each, *args, **kwargs) 21 | lines.append(line) 22 | return '\n'.join(lines) 23 | 24 | 25 | def write_jsonl(obj: Any, fp: io.IOBase, cls=None, **kwargs): 26 | text = to_jsonl(obj, cls, **kwargs) 27 | if(isinstance(fp, (io.RawIOBase, io.BufferedIOBase))): 28 | fp.write(text.encode()) 29 | else: 30 | fp.write(text) 31 | 32 | 33 | def to_jsonl(obj: Any, cls=None, **kwargs): 34 | if cls is None: 35 | cls = JsonlEncoder 36 | return cls(**kwargs).encode(obj) 37 | -------------------------------------------------------------------------------- /DataScripts/log.py: -------------------------------------------------------------------------------- 1 | from cfg import Cfg 2 | import seqlog 3 | import logging 4 | 5 | 6 | def configure_log(cfg: Cfg) -> logging.Logger: 7 | seqlog.configure_from_dict({ 8 | 'version': 1, 9 | 'disable_existing_loggers': True, 10 | 'root': { 11 | 'level': 'WARN', 12 | 'handlers': ['console'] 13 | }, 14 | 'loggers': { 15 | 'seq': { 16 | 'level': 'DEBUG', 17 | 'handlers': ['seq'], 18 | 'propagate': True 19 | } 20 | }, 21 | 'handlers': { 22 | 'console': { 23 | 'class': 'seqlog.structured_logging.ConsoleStructuredLogHandler', 24 | 'formatter': 'seq' 25 | }, 26 | 'seq': { 27 | 'class': 'seqlog.structured_logging.SeqLogHandler', 28 | 'server_url': cfg.seq.seqUrl, 29 | 'batch_size': 10, 30 | 'auto_flush_timeout': 2, 31 | 'formatter': 'seq' 32 | } 33 | }, 34 | 'formatters': { 35 | 'seq': { 36 | 'style': '{' 37 | } 38 | } 
39 | }) 40 | 41 | seqlog.set_global_log_properties( 42 | App="DataScripts", 43 | Env=cfg.env or '', 44 | BranchEnv=cfg.branchEnv or '', 45 | Machine=cfg.machine or '' 46 | ) 47 | 48 | return logging.getLogger('seq') 49 | -------------------------------------------------------------------------------- /DataScripts/readme.md: -------------------------------------------------------------------------------- 1 | ## Initial Setup 2 | 3 | ### 1 - Run the following in this directory 4 | `pip install --user -r requirements.txt` 5 | 6 | Alternatively (if you have docker & vscode), open this folder in the devcontainer. 7 | 8 | ### 2 - configure `.env` file 9 | create a `.env` file in this directory with the following variables 10 | ``` 11 | cfg_sas= 12 | branch_env= 13 | run_state= 14 | ``` 15 | 16 | **cfg_sas** 17 | This is a URL to a configuration file. 18 | 19 | **run_state** 20 | Json with paths to video batches. If you are running interactively, this is not needed. 21 | 22 | **branch_env** 23 | leave blank to use prod environment. If specified, will use separate stores/warehouses etc. when you need an isolated environment. This separate environment is created using `recfluence create-env` (not documented yet).
24 | 25 | ### 3 - Run 26 | Run `python app.py -h` for help 27 | 28 | 29 | -------------------------------------------------------------------------------- /DataScripts/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | dataclasses 3 | dataclasses_json 4 | dataclasses_jsonschema 5 | pandas 6 | seqlog 7 | spacy==2.3.5 8 | setuptools 9 | wheel 10 | snowflake-connector-python 11 | aiohttp 12 | azure-storage-blob 13 | ndjson 14 | -------------------------------------------------------------------------------- /DataScripts/sf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from snowflake.connector.connection import SnowflakeConnection 3 | from cfg import SnowflakeCfg 4 | import snowflake.connector 5 | 6 | 7 | def sf_connect(cfg: SnowflakeCfg): 8 | 9 | creds = cfg.creds.split(':') 10 | account = '.'.join(cfg.host.split('.')[:3]) 11 | 12 | # Gets the version 13 | ctx = snowflake.connector.connect( 14 | user=creds[0], 15 | password=creds[1], 16 | account=account, 17 | warehouse=cfg.warehouse, 18 | database=cfg.db, 19 | schema=cfg.schema, 20 | role=cfg.role 21 | ) 22 | 23 | return ctx 24 | 25 | 26 | def sf_test(db: SnowflakeConnection): 27 | cs = db.cursor() 28 | try: 29 | cs.execute("SELECT current_version()") 30 | one_row = cs.fetchone() 31 | print(one_row[0]) 32 | finally: 33 | cs.close() 34 | -------------------------------------------------------------------------------- /Dataform/.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.134.1/containers/alpine/.devcontainer/base.Dockerfile 2 | ARG VARIANT="3.12" 3 | FROM mcr.microsoft.com/vscode/devcontainers/base:0-alpine-${VARIANT} 4 | 5 | # ** [Optional] Uncomment this section to install additional packages. 
** 6 | RUN apk update && apk add --no-cache \ 7 | npm nodejs 8 | 9 | RUN npm i -g @dataform/cli typescript ts-node 10 | -------------------------------------------------------------------------------- /Dataform/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.134.1/containers/alpine 3 | { 4 | "name": "Alpine", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | // Update 'VARIANT' to pick an Alpine version: 3.10, 3.11, 3.12 8 | "args": { "VARIANT": "3.12" } 9 | }, 10 | 11 | // Set *default* container specific settings.json values on container create. 12 | "settings": { 13 | "terminal.integrated.shell.linux": "/bin/ash" 14 | }, 15 | 16 | "extensions": [], 17 | 18 | // Use 'postCreateCommand' to run commands after the container is created. 19 | "postCreateCommand": "npm i" 20 | } -------------------------------------------------------------------------------- /Dataform/.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | node_modules/* 3 | .run -------------------------------------------------------------------------------- /Dataform/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .df-credentials.json 3 | .run -------------------------------------------------------------------------------- /Dataform/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug", 6 | "type": "node", 7 | "request": "launch", 8 | "args": ["${workspaceRoot}/src/run.ts"], 9 | "runtimeArgs": ["-r", "ts-node/register"], 10 | "cwd": "${workspaceRoot}", 11 | "protocol": "inspector", 12 | "internalConsoleOptions": "openOnSessionStart", 
13 | "env": { 14 | "TS_NODE_IGNORE": "false" 15 | }, 16 | "outputCapture": "std" 17 | }, 18 | { 19 | "name": "Debug (prompt)", 20 | "type": "node", 21 | "request": "launch", 22 | "args": ["${workspaceRoot}/src/run.ts", "-d", "\"${input:dataform_args}\""], 23 | "runtimeArgs": ["-r", "ts-node/register"], 24 | "cwd": "${workspaceRoot}", 25 | "protocol": "inspector", 26 | "internalConsoleOptions": "openOnSessionStart", 27 | "env": { 28 | "TS_NODE_IGNORE": "false" 29 | }, 30 | "outputCapture": "std" 31 | }, 32 | ], 33 | "inputs": [ 34 | { 35 | "id": "dataform_args", 36 | "description": "Enter a list of actions to run e.g: \"--actions REC\", \"--tags standard\"", 37 | "type": "promptString" 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /Dataform/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | ".run/**": true, 4 | "node_modules/**": true 5 | }, 6 | "cSpell.words": [ 7 | "Dataform", 8 | "Dataform's", 9 | "basepath", 10 | "promisify", 11 | "serializers" 12 | ] 13 | } -------------------------------------------------------------------------------- /Dataform/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "build docker", 8 | "type": "shell", 9 | "command": "docker", 10 | "args": [ 11 | "build", 12 | "-f", 13 | "Dockerfile", 14 | "-t", 15 | "ytnetworks.azurecr.io/dataform:latest", 16 | "."
17 | ], 18 | "problemMatcher": [], 19 | "group": "build" 20 | }, 21 | { 22 | "label": "run docker -it", 23 | "type": "shell", 24 | "command": "docker", 25 | "args": [ 26 | "run", 27 | "-it", 28 | "--env-file", 29 | ".env", 30 | "ytnetworks.azurecr.io/dataform:latest", 31 | "bin/sh" 32 | ], 33 | "problemMatcher": [] 34 | }, 35 | { 36 | "label": "run docker", 37 | "type": "shell", 38 | "command": "docker", 39 | "args": [ 40 | "run", 41 | "--env-file", 42 | ".env", 43 | "ytnetworks.azurecr.io/dataform:latest" 44 | ], 45 | "problemMatcher": [] 46 | }, 47 | { 48 | "label": "push docker", 49 | "type": "shell", 50 | "command": "docker", 51 | "args": [ 52 | "push", 53 | "ytnetworks.azurecr.io/dataform:latest" 54 | ], 55 | "problemMatcher": [] 56 | }, 57 | { 58 | "label": "build tsc", 59 | "group": "build", 60 | "type": "shell", 61 | "command": "tsc", 62 | "args": ["--skipLibCheck"], 63 | "problemMatcher": [ "$tsc" ] 64 | } 65 | ] 66 | } -------------------------------------------------------------------------------- /Dataform/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.13 2 | 3 | RUN apk add --no-cache --update nodejs nodejs-npm git 4 | RUN npm i -g @dataform/cli typescript ts-node 5 | 6 | COPY package*.json ./ 7 | COPY tsconfig.json ./ 8 | RUN npm i --production 9 | COPY src ./src 10 | 11 | RUN tsc --skipLibCheck 12 | RUN rm -rf /root/.npm/_cacache/ 13 | 14 | CMD ["ts-node-script", "src/run.ts"] -------------------------------------------------------------------------------- /Dataform/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataform", 3 | "version": "1.0.0", 4 | "description": "run a dataform update from a repo", 5 | "main": "run.ts", 6 | "scripts": { 7 | "build": "tsc --skipLibCheck", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "author": "Mark Ledwich", 11 | "license": "MIT", 12 | "dependencies": { 13 | 
"@dataform/cli": "^1.17.2", 14 | "bunyan": "^1.8.14", 15 | "bunyan-debug-stream": "^2.0.0", 16 | "bunyan-seq": "^0.4.0", 17 | "dateformat": "^3.0.3", 18 | "dotenv": "^8.2.0", 19 | "humanize-duration": "^3.25.0", 20 | "lodash": "^4.17.20", 21 | "promisify-child-process": "^4.1.1", 22 | "strip-ansi": "^6.0.0", 23 | "ts-node": "^8.10.2", 24 | "typescript": "^3.9.7", 25 | "yargs": "^15.4.1" 26 | }, 27 | "devDependencies": { 28 | "@types/lodash": "^4.14.166", 29 | "@types/yargs": "^15.0.12", 30 | "@types/bunyan": "^1.8.6", 31 | "@types/bunyan-seq": "^0.2.2", 32 | "@types/dateformat": "^3.0.1", 33 | "@types/humanize-duration": "^3.18.1" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Dataform/src/NodeTypings.d.ts: -------------------------------------------------------------------------------- 1 | declare namespace NodeJS { 2 | 3 | // Merge the existing `ProcessEnv` definition with ours 4 | // https://www.typescriptlang.org/docs/handbook/declaration-merging.html#merging-interfaces 5 | export interface ProcessEnv { 6 | SNOWFLAKE_JSON: string 7 | REPO: string 8 | // this is the repo branch (not the branch environment) 9 | BRANCH: string 10 | DATAFORM_RUN_ARGS: string 11 | SEQ:string 12 | } 13 | } -------------------------------------------------------------------------------- /Dataform/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": [ 3 | "src/**/*" 4 | ], 5 | "exclude": [ 6 | "node_modules", 7 | "**/*.spec.ts" 8 | ], 9 | "compilerOptions": { 10 | "target": "ES2015", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */ 11 | "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. 
*/ 12 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 13 | "forceConsistentCasingInFileNames": true, 14 | "sourceMap": true 15 | } 16 | } -------------------------------------------------------------------------------- /Env/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.112.0/containers/azure-cli 3 | { 4 | "name": "Azure CLI", 5 | "dockerFile": "Dockerfile", 6 | 7 | // Set *default* container specific settings.json values on container create. 8 | "settings": { 9 | "terminal.integrated.shell.linux": "/bin/bash" 10 | }, 11 | 12 | // Add the IDs of extensions you want installed when the container is created. 13 | "extensions": [ 14 | "ms-vscode.azurecli" 15 | ] 16 | 17 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 18 | // "forwardPorts": [], 19 | 20 | // Use 'postCreateCommand' to run commands after the container is created. 21 | // "postCreateCommand": "az --version", 22 | 23 | // Uncomment when using a ptrace-based debugger like C++, Go, and Rust 24 | // "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ], 25 | 26 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 27 | // "remoteUser": "vscode" 28 | 29 | } -------------------------------------------------------------------------------- /Env/backrup_db2.azcli: -------------------------------------------------------------------------------- 1 | 2 | # ML 2020-05-14 Not in prod use. 
3 | 4 | 5 | # get SAS uri's for source and dest 6 | 7 | az login 8 | 9 | 10 | from_group=ytnetworks 11 | from_account=pyt 12 | from_container=testdata 13 | from_path="db2/*" 14 | 15 | to_group=dev 16 | to_account=mutuodev 17 | to_container=backup 18 | date=`date +%Y-%m-%dT%H:%MZ` 19 | to_path="db2/${date}" 20 | 21 | now=`date +"%Y-%m-%dT%H:%M:00Z"` 22 | expiry=`date -d "$now + 3 days" +"%Y-%m-%dT%H:%M:00Z"` 23 | 24 | 25 | 26 | #SASa=`az storage container generate-sas --name $srcCon --start $NOW --expiry $EXPIRY --permissions rl --account-name $srcAcc --account-key $srcKey --output tsv` 27 | 28 | expiry=`date -u -d "4 days" '+%Y-%m-%dT%H:%M:%SZ'` 29 | 30 | backupdir=`date +%Y-%m-%dT%H:%MZ` 31 | 32 | 33 | from_cs=`az storage account show-connection-string --name $from_account --resource-group $from_group --output tsv` 34 | from_sas=`az storage container generate-sas --connection-string $from_cs --name $from_container --permissions rl --expiry $expiry --start $now --output tsv` 35 | 36 | to_cs=`az storage account show-connection-string --name $to_account --resource-group $to_group --output tsv` 37 | to_sas=`az storage container generate-sas --connection-string $to_cs --name $to_container --permissions acdrw --expiry $expiry --start $now --output tsv` 38 | 39 | 40 | #azcopy copy --help 41 | from_url="https://${from_account}.blob.core.windows.net/${from_container}/${from_path}?${from_sas}" 42 | to_url="https://${to_account}.blob.core.windows.net/${to_container}/${to_path}?${to_sas}" 43 | 44 | azcopy copy $from_url $to_url --overwrite=prompt --recursive 45 | 46 | 47 | azcopy copy "https://pyt.blob.core.windows.net/testdata/db2/*?sv=2018-03-28&ss=b&srt=co&sp=rl&se=2020-05-21T00%3A38%3A35Z&sig=jRygPycCgpVMPBEF3%2FxT7OAe4yuuvjsXgEoqT4zPg2U%3D" "https://mutuodev.blob.core.windows.net/backup/2020-05-14-manual/db2?se=2020-06-13T00%3A39%3A13Z&sp=rwl&sv=2018-03-28&sr=c&sig=SXzFVugUkGwRnrQC0Dhrtk%2B5GsZDEhgWpF7K8tvyPWI%3D" --overwrite=prompt --recursive 
--s2s-preserve-access-tier=false; 48 | 49 | -------------------------------------------------------------------------------- /Env/create_seq.azcli: -------------------------------------------------------------------------------- 1 | az login 2 | 3 | # Change these four parameters as needed 4 | ACI_PERS_RESOURCE_GROUP=ytnetworks 5 | ACI_PERS_STORAGE_ACCOUNT_NAME=seq13125 6 | ACI_PERS_LOCATION=westus2 7 | NAME=seq 8 | 9 | # Create the storage account with the parameters 10 | az storage account create \ 11 | --resource-group $ACI_PERS_RESOURCE_GROUP \ 12 | --name $ACI_PERS_STORAGE_ACCOUNT_NAME \ 13 | --location $ACI_PERS_LOCATION \ 14 | --sku Standard_LRS 15 | 16 | # Create the file share 17 | az storage share create \ 18 | --name $NAME \ 19 | --account-name $ACI_PERS_STORAGE_ACCOUNT_NAME 20 | 21 | STORAGE_KEY=$(az storage account keys list --resource-group $ACI_PERS_RESOURCE_GROUP --account-name $ACI_PERS_STORAGE_ACCOUNT_NAME --query "[0].value" --output tsv) 22 | echo $STORAGE_KEY 23 | 24 | MOUNT_PATH=/data 25 | az container create --resource-group $ACI_PERS_RESOURCE_GROUP --name $NAME -l $ACI_PERS_LOCATION --image datalust/seq:latest --dns-name-label ytseq --ports 80 443 -e ACCEPT_EULA=Y --azure-file-volume-account-name $ACI_PERS_STORAGE_ACCOUNT_NAME --azure-file-volume-account-key $STORAGE_KEY --azure-file-volume-share-name $NAME --azure-file-volume-mount-path $MOUNT_PATH --memory 4 --cpu 1 26 | 27 | #az container restart -n $NAME -g $ACI_PERS_RESOURCE_GROUP 28 | az container stop -n $NAME --resource-group $ACI_PERS_RESOURCE_GROUP 29 | #az container delete -n $NAME --resource-group $ACI_PERS_RESOURCE_GROUP 30 | #curl -XPOST "http://ytseq.westus2.azurecontainer.io/api/events/raw?clef" -d "{'@t':'2018-06-07T03:44:57.8532799Z','@mt':'Hello, {User}','User':'alice'}" -------------------------------------------------------------------------------- /Env/github actions.azcli: -------------------------------------------------------------------------------- 1 | az account 
show --subscription 2 | 3 | az ad sp list 4 | 5 | az ad sp create-for-rbac --name "recfluence-github" --role contributor \ 6 | --scopes /subscriptions/93ef8661-3e11-4fcc-92ae-d0383e7e4ad3/resourceGroups/ytnetworks \ 7 | --sdk-auth 8 | 9 | # Replace {subscription-id}, {resource-group} with the subscription, resource group details 10 | 11 | # The command should output a JSON object similar to this: 12 | 13 | { 14 | "clientId": "", 15 | "clientSecret": "", 16 | "subscriptionId": "", 17 | "tenantId": "", 18 | (...) 19 | } -------------------------------------------------------------------------------- /Env/prod_update.azcli: -------------------------------------------------------------------------------- 1 | az login 2 | 3 | az configure --defaults location=westus2 group=ytnetworks 4 | registryPass=`az keyvault secret show --name "ytnetworks-registry-password" --vault-name "recfluence" --output json | jq -r '.value'` 5 | cs=`az storage account show-connection-string --name ytapp --output tsv` 6 | 7 | 8 | # update 9 | az container create --name update-manual --image ytnetworks.azurecr.io/recfluence:latest --cpu 2 --memory 4 \ 10 | --command-line "./recfluence update" \ 11 | --environment-variables env=prod appStoreCs=$cs \ 12 | --registry-username ytnetworks --registry-password $registryPass \ 13 | --restart-policy Never 14 | 15 | az container delete --name update-manual 16 | 17 | 18 | # userscrape 19 | 20 | sas_end=`date -u -d "120 minutes" '+%Y-%m-%dT%H:%MZ'` 21 | us_cfg_sas=`az storage blob generate-sas --connection-string $cs --container-name cfg --name userscrape.json --permissions r --expiry $sas_end --full-uri --output tsv` 22 | az container create --name userscrape-manual3 --image ytnetworks.azurecr.io/userscrape:latest --cpu 1 --memory 2 \ 23 | --command-line "python app.py -t 2020-09-07_21-21-36_3VW" \ 24 | --environment-variables cfg_sas=$us_cfg_sas env=prod \ 25 | --registry-username ytnetworks --registry-password $registryPass \ 26 | --restart-policy Never 
27 | 28 | 29 | 30 | # scheduling update. Decided against because it's new and can't work out how to run/send parameters 31 | az acr task create \ 32 | --name recfluence \ 33 | --registry ytnetworks \ 34 | --schedule "0 0 * * *" \ 35 | --context /dev/null \ 36 | --cmd "ytnetworks.azurecr.io/userscrape:latest ./recfluence update -a Search" 37 | az acr task run -r ytnetworks -n recfluence 38 | az acr task delete -r ytnetworks -n recfluence 39 | -------------------------------------------------------------------------------- /Env/recfluence_update.azcli: -------------------------------------------------------------------------------- 1 | az login 2 | 3 | az configure --defaults location=westus2 group=ytnetworks 4 | registryPass=`az keyvault secret show --name "ytnetworks-registry-password" --vault-name "recfluence" --output json | jq -r '.value'` 5 | cs=`az storage account show-connection-string --name ytapp --output tsv` 6 | 7 | az container delete --name update-manual 8 | 9 | az container create --name update-manual --image ytnetworks.azurecr.io/recfluence:latest --cpu 1 --memory 1 \ 10 | --command-line "./recfluence update -a Dataform" \ 11 | --environment-variables env=prod appStoreCs=$cs \ 12 | --registry-username ytnetworks --registry-password $registryPass \ 13 | --restart-policy Never 14 | 15 | 16 | az container start --name dataform 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Mark Ledwich 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 |
furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Site/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | "@babel/plugin-proposal-optional-chaining", 4 | "@babel/plugin-proposal-nullish-coalescing-operator" 5 | ], 6 | "presets": [ 7 | [ 8 | "babel-preset-gatsby", 9 | { 10 | "targets": { 11 | "browsers": [ 12 | ">0.25%", 13 | "not dead" 14 | ] 15 | } 16 | } 17 | ] 18 | ] 19 | } -------------------------------------------------------------------------------- /Site/.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.209.6/containers/typescript-node/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] Node.js version (use -bullseye variants on local arm64/Apple Silicon): 16, 14, 12, 16-bullseye, 14-bullseye, 12-bullseye, 16-buster, 14-buster, 12-buster 4 | ARG VARIANT="16-bullseye" 5 | FROM mcr.microsoft.com/vscode/devcontainers/typescript-node:0-${VARIANT} 6 | 7 | # [Optional] Uncomment this section to install additional OS packages. 
8 | # RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 9 | # && apt-get -y install --no-install-recommends 10 | 11 | # [Optional] Uncomment if you want to install an additional version of node using nvm 12 | # ARG EXTRA_NODE_VERSION=10 13 | # RUN su node -c "source /usr/local/share/nvm/nvm.sh && nvm install ${EXTRA_NODE_VERSION}" 14 | 15 | # [Optional] Uncomment if you want to install more global node packages 16 | # RUN su node -c "npm install -g " 17 | -------------------------------------------------------------------------------- /Site/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.209.6/containers/typescript-node 3 | { 4 | "name": "Node.js & TypeScript", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | // Update 'VARIANT' to pick a Node version: 16, 14, 12. 8 | // Append -bullseye or -buster to pin to an OS version. 9 | // Use -bullseye variants on local on arm64/Apple Silicon. 10 | "args": { 11 | "VARIANT": "12" 12 | } 13 | }, 14 | // Set *default* container specific settings.json values on container create. 15 | "settings": {}, 16 | // Add the IDs of extensions you want installed when the container is created. 
17 | "extensions": [ 18 | "dbaeumer.vscode-eslint" 19 | ] 20 | } -------------------------------------------------------------------------------- /Site/.env.development: -------------------------------------------------------------------------------- 1 | RESULTS_HOST=https://ytapp.blob.core.windows.net 2 | RESULTS_CONTAINER=public 3 | RESULTS_PATH=results 4 | FUNC_URL=http://host.docker.internal:7071/api/ 5 | AUTH0_DOMAIN=recfluence.auth0.com 6 | AUTH0_CLIENTID=5xMPNVfzs1m6P14tu9hjsTgljplW77cI 7 | ES_URL=https://recfluence.azurewebsites.net/api/es/ 8 | BRANCH_ENV= -------------------------------------------------------------------------------- /Site/.env.production: -------------------------------------------------------------------------------- 1 | RESULTS_HOST=https://ytapp.blob.core.windows.net 2 | RESULTS_CONTAINER=public 3 | RESULTS_PATH=results 4 | FUNC_URL=https://recfluence.azurewebsites.net/api/ 5 | AUTH0_DOMAIN=recfluence.auth0.com 6 | AUTH0_CLIENTID=5xMPNVfzs1m6P14tu9hjsTgljplW77cI 7 | ES_URL=https://recfluence.azurewebsites.net/api/es/ 8 | BRANCH_ENV= -------------------------------------------------------------------------------- /Site/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (http://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 
| node_modules/ 37 | jspm_packages/ 38 | 39 | # Typescript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | .cache/ 61 | public 62 | yarn-error.log 63 | 64 | __*__ 65 | __*__.json -------------------------------------------------------------------------------- /Site/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "chrome", 9 | "request": "launch", 10 | "name": "Launch Chrome against localhost", 11 | "url": "http://localhost:8000/review", 12 | "webRoot": "${workspaceFolder}" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /Site/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | "__*__.json": true, 4 | "__*__": true, 5 | ".cache": true, 6 | "node_modules": true 7 | }, 8 | "editor.formatOnSave": true, 9 | "cSpell.words": [ 10 | "Identitarian", 11 | "Intersectionality", 12 | "Jsonl", 13 | "Kavanaugh", 14 | "Zaitsev", 15 | "aggs", 16 | "algoliasearch", 17 | "appbaseio", 18 | "authed", 19 | "dcolor", 20 | "lucene", 21 | "reactivesearch" 22 | ], 23 | "typescript.preferences.quoteStyle": "single", 24 | "javascript.format.semicolons": "remove", 25 | "typescript.format.semicolons": "remove", 26 | "editor.tabSize": 2, 27 | "[json]": { 28 | "editor.defaultFormatter": "vscode.json-language-features" 29 | }, 30 | 
"[typescriptreact]": { 31 | "editor.defaultFormatter": "vscode.typescript-language-features", 32 | }, 33 | "[typescript]": { 34 | "editor.defaultFormatter": "vscode.typescript-language-features" 35 | }, 36 | "[html]": { 37 | "editor.defaultFormatter": "vscode.html-language-features" 38 | }, 39 | "javascript.preferences.quoteStyle": "single", 40 | "prettier.jsxSingleQuote": true, 41 | "prettier.singleQuote": true, 42 | "typescript.tsdk": "node_modules\\typescript\\lib", 43 | "debug.javascript.warnOnLongPrediction": false, 44 | "workbench.colorCustomizations": { 45 | "[Default Dark+]": { 46 | "activityBar.background": "#1c2b1c" 47 | } 48 | } 49 | } -------------------------------------------------------------------------------- /Site/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "develop", 8 | "type": "npm", 9 | "script": "develop", 10 | "problemMatcher": [ 11 | "$eslint-stylish" 12 | ], 13 | "group": "build" 14 | }, 15 | { 16 | "label": "build", 17 | "type": "npm", 18 | "script": "build --prefix-paths", 19 | "problemMatcher": [ 20 | "$eslint-stylish" 21 | ], 22 | "group": "build" 23 | }, 24 | { 25 | "label": "serve", 26 | "type": "npm", 27 | "script": "serve", 28 | "problemMatcher": [ 29 | "$eslint-stylish" 30 | ] 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /Site/Logo.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Site/Logo.pptx -------------------------------------------------------------------------------- /Site/gatsby-config.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = { 3 | 
siteMetadata: { 4 | title: `Recfluence` 5 | }, 6 | plugins: [ 7 | `gatsby-plugin-react-helmet`, 8 | `gatsby-plugin-styled-components`, 9 | `gatsby-plugin-netlify`, 10 | { 11 | resolve: `gatsby-plugin-typescript`, 12 | options: { 13 | isTSX: true, // defaults to false 14 | allExtensions: true, // defaults to false 15 | }, 16 | }, 17 | { 18 | resolve: `gatsby-plugin-google-analytics`, 19 | options: { 20 | trackingId: "UA-130770302-1", 21 | head: true, 22 | } 23 | }, 24 | { 25 | resolve: `gatsby-plugin-create-client-paths`, 26 | options: { prefixes: [`/video/*`] }, 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /Site/gatsby-node.js: -------------------------------------------------------------------------------- 1 | exports.onCreateWebpackConfig = ({ stage, actions }) => { 2 | if (stage == 'develop') { 3 | actions.setWebpackConfig({ 4 | node: { 5 | fs: 'empty' 6 | }, 7 | devtool: "eval-source-map" 8 | }) 9 | } 10 | } -------------------------------------------------------------------------------- /Site/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | roots: ['<rootDir>/src'], 3 | preset: 'ts-jest', 4 | testEnvironment: 'node' 5 | }; -------------------------------------------------------------------------------- /Site/src/common/DbModel.ts: -------------------------------------------------------------------------------- 1 | import { ChannelData } from "./YtModel" 2 | import { parseISO } from "date-fns" 3 | import { getJson } from "./Utils" 4 | import _ from 'lodash' 5 | 6 | export class DbModel { 7 | static ChannelData(dbChannel: any): ChannelData { 8 | const c = _.mapKeys(dbChannel, (_, k) => k.toLowerCase()) 9 | 10 | try { 11 | return { 12 | channelId: c.channel_id, 13 | title: c.channel_title, 14 | tags: c.tags ? 
JSON.parse(c.tags) : [], 15 | subCount: c.subs, 16 | channelVideoViews: +c.channel_views, 17 | thumbnail: c.logo_url, 18 | lr: c.lr, 19 | publishedFrom: parseISO(c.from_date), 20 | publishedTo: parseISO(c.to_date), 21 | dailyViews: +c.video_views_daily, 22 | relevantDailyViews: +c.relevant_video_views_daily, 23 | views: +c.channel_views, 24 | relevantImpressionsDaily: c.relevant_impressions_daily, 25 | relevantImpressionsDailyIn: c.relevant_impressions_in_daily, 26 | ideology: c.ideology, 27 | media: c.media, 28 | relevance: c.relevance, 29 | lifetimeDailyViews: c.channel_lifetime_daily_views ? 30 | +c.channel_lifetime_daily_views : null 31 | } 32 | } 33 | catch (ex) { 34 | console.log('error parsing channel row', JSON.stringify(c)) 35 | throw ex 36 | } 37 | } 38 | } 39 | 40 | -------------------------------------------------------------------------------- /Site/src/common/Elastic.ts: -------------------------------------------------------------------------------- 1 | 2 | const regex = /\w+|"[^"]+"/g 3 | 4 | export function queryHighlights(query: string): string[] { 5 | try { 6 | const matches = query.match(regex) 7 | var i = matches.length 8 | while (i--) matches[i] = matches[i].replace(/"/g, "") 9 | return matches 10 | } 11 | catch (error) { 12 | console.log(`error parsing query ${query}: ${error}`) 13 | return [] 14 | } 15 | } 16 | 17 | export function esCfgFromEnv(): EsCfg { 18 | const prefix = process.env.BRANCH_ENV ? 
`${process.env.BRANCH_ENV}-` : '' 19 | return { 20 | url: process.env.ES_URL, 21 | prefix: process.env.ES_PREFIX, 22 | indexes: { 23 | caption: `${prefix}caption-2`, 24 | channel: `${prefix}channel-2`, 25 | channelTitle: `${prefix}channel_title-2`, 26 | video: `${prefix}video-2` 27 | } 28 | } 29 | } 30 | 31 | export interface EsCfg { 32 | url: string 33 | prefix: string 34 | indexes: EsIndexes 35 | } 36 | 37 | export interface EsIndexes { 38 | caption: string 39 | video: string 40 | channel: string 41 | channelTitle: string 42 | } 43 | 44 | export interface EsSearchRes<T> { 45 | hits: { hits: EsDocRes<T>[] } 46 | } 47 | 48 | export interface EsDocRes<T> { 49 | found: boolean 50 | _source: T 51 | } 52 | 53 | export interface EsDocsRes<T> { 54 | docs: EsDocRes<T>[] 55 | } -------------------------------------------------------------------------------- /Site/src/common/Uri.spec.ts: -------------------------------------------------------------------------------- 1 | import { uri } from './Uri' 2 | 3 | test('part appended', () => { 4 | expect(uri('http://test.com/folder').addPath('file.json').url).toBe('http://test.com/folder/file.json'), 5 | expect(uri('http://test.com/folder/').addPath('file.json').url).toBe('http://test.com/folder/file.json'), 6 | expect( 7 | uri('https://test.com') 8 | .with({ 9 | host: 'testing.com' 10 | }) 11 | .addQuery({ filter: 'q' }) 12 | .with({ 13 | username: 'mark', 14 | password: 'pass', 15 | port: 7071, 16 | path: ['a', 'b'] 17 | }) 18 | .addQuery({ prop: 's' }) 19 | .addPath('c') 20 | .url 21 | ).toBe('https://mark:pass@testing.com:7071/a/b/c?filter=q&prop=s') 22 | }) 23 | -------------------------------------------------------------------------------- /Site/src/common/Uri.ts: -------------------------------------------------------------------------------- 1 | 2 | import _, { toNumber } from 'lodash' 3 | 4 | interface UriParts { 5 | uri?: string 6 | protocol?: string 7 | host?: string 8 | port?: number 9 | username?: string 10 | password?: string 11 | 
path?: string[] 12 | query?: { [key: string]: string } 13 | hash?: string 14 | } 15 | 16 | export class Uri { 17 | parts: UriParts 18 | 19 | constructor(uri: string | UriParts) { 20 | this.parts = typeof uri === 'string' ? this.asUri(uri) : uri 21 | } 22 | 23 | asUri = (url: string): UriParts => { 24 | const p = new URL(url) 25 | 26 | const q: { [key: string]: string } = {} 27 | for (const [key, value] of p.searchParams.entries()) 28 | q[key] = value 29 | 30 | return { 31 | protocol: p.protocol, 32 | host: p.host, 33 | port: p.port ? toNumber(p.port) : null, 34 | username: p.username, 35 | password: p.password, 36 | path: p.pathname?.split('/').filter(p => p), 37 | query: q, 38 | hash: p.hash 39 | } 40 | }; 41 | 42 | asUrl = (p: UriParts): URL => { 43 | const u = new URL(`${p.protocol ?? '_:'}//${p.host ?? '_'}`) 44 | if (p.port) u.port = p.port.toString() 45 | if (p.username) u.username = p.username 46 | if (p.password) u.password = p.password 47 | if (p.path?.length > 0) u.pathname = p.path.join('/') 48 | if (p.hash) u.hash = p.hash 49 | if (p.query) 50 | for (const [key, value] of Object.entries(p.query)) 51 | u.searchParams.append(key, value) 52 | return u 53 | }; 54 | 55 | with = (parts: UriParts) => new Uri(Object.assign({}, this.parts, parts)) 56 | 57 | /** appends the given query string values to the existing, returns the new Uri */ 58 | addQuery = (q: { [key: string]: string }) => this.with({ query: Object.assign({}, this.parts.query, q) }) 59 | 60 | /** appends a path part to the existing one. returns a new Uri */ 61 | addPath = (...path: string[]) => this.with({ path: (this.parts.path ?? 
[]).concat(path) }) 62 | 63 | get url() { return this.asUrl(this.parts).toString() } 64 | public toString() { return this.url }; 65 | } 66 | 67 | /** creates a Uri object for fluently building/modifying a url */ 68 | export function uri(uri: string) { 69 | return new Uri(uri) 70 | } -------------------------------------------------------------------------------- /Site/src/components/Button.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState, PropsWithChildren, MouseEventHandler, CSSProperties } from "react" 2 | import styled from 'styled-components' 3 | import { ytTheme } from './MainLayout' 4 | import { StyledIconBase } from '@styled-icons/styled-icon' 5 | 6 | interface ButtonProps { 7 | label?: string 8 | icon?: JSX.Element 9 | onclick: MouseEventHandler 10 | primary?: boolean 11 | } 12 | 13 | const ButtonStyle = styled.button` 14 | display:flex; 15 | 16 | text-transform: uppercase; 17 | text-align: center; 18 | 19 | 20 | border: none; 21 | cursor: pointer; 22 | align-self: center; 23 | color: ${ytTheme.fontColor}; 24 | 25 | background-color: ${p => p.primary ? 
ytTheme.themeColorSubtler : ytTheme.backColorBolder}; 26 | :hover { 27 | background-color: ${ytTheme.backColorBolder3}; 28 | } 29 | font-size: 1em; 30 | line-height: 1em; 31 | padding: .5em 1em 0.5em 1em; 32 | border-radius: 0.2em; 33 | outline: none; 34 | font-weight: bolder; 35 | 36 | ${StyledIconBase} { 37 | height: 1.4em; 38 | width: 1.4em; 39 | position: relative; 40 | top: -0.15em; 41 | padding-right: 0.2em; 42 | } 43 | ` 44 | 45 | export const Button = ({ label, icon, onclick, primary }: ButtonProps) => {icon} {label} 46 | 47 | export const inlineButtonStyle: CSSProperties = { 48 | height: '1.4em', 49 | width: '1.4em', 50 | position: 'relative', 51 | top: '-0.10em', 52 | paddingRight: '0.2em' 53 | } -------------------------------------------------------------------------------- /Site/src/components/OutsideClick.tsx: -------------------------------------------------------------------------------- 1 | import React, { MutableRefObject } from 'react' 2 | import { useEffect } from "react" 3 | 4 | const useOutsideClick = (ref: MutableRefObject, callback: (e: Element) => void) => { 5 | const handleClick = (e: any) => { 6 | if (ref.current && !ref.current.contains(e.target)) callback(e.target) 7 | } 8 | 9 | useEffect(() => { 10 | document.addEventListener("click", handleClick) 11 | return () => document.removeEventListener("click", handleClick) 12 | }) 13 | } 14 | 15 | export default useOutsideClick -------------------------------------------------------------------------------- /Site/src/components/SearchContext.tsx: -------------------------------------------------------------------------------- 1 | import { FunctionComponent } from 'react' 2 | import { EsCfg } from '../common/Elastic' 3 | import React from 'react' 4 | 5 | export const EsContext = React.createContext(null) 6 | export const EsContextProvider: FunctionComponent<{ esCfg: EsCfg }> = 7 | ({ children, esCfg }) => -------------------------------------------------------------------------------- 
/Site/src/components/SiteMenu.tsx: -------------------------------------------------------------------------------- 1 | import React, { useContext, FunctionComponent } from 'react' 2 | import styled from 'styled-components' 3 | import logo from '../images/recfluence_word.svg' 4 | import { ytTheme, safeLocation } from './MainLayout' 5 | import { Link } from 'gatsby' 6 | import { Person as IconUser, ExitToApp as IconLogout } from '@styled-icons/material' 7 | import { UserContext } from './UserContext' 8 | import { UserMenu } from './UserMenu' 9 | 10 | const HeaderBar = styled.div` 11 | padding:6px 5px 3px 10px; 12 | display:flex; 13 | width:100%; 14 | background-color:${ytTheme.backColorBolder}; 15 | ` 16 | 17 | const NavStyle = styled.nav` 18 | display:flex; 19 | justify-content:space-between; 20 | align-items:center; /* vertical alignment of child items. I'm crap a googling, or this is a secret */ 21 | width:100%; 22 | 23 | .menu-item { 24 | padding-left: 1em; 25 | } 26 | 27 | .text-links a { 28 | text-transform: uppercase; 29 | } 30 | 31 | .text-links a.active { 32 | color: ${ytTheme.themeColorBolder}; 33 | text-shadow: ${ytTheme.fontThemeShadow}; 34 | } 35 | 36 | /* .icon, .text-icon { 37 | height: 1.7em; 38 | position:relative; 39 | top:0.2em; 40 | } */ 41 | ` 42 | 43 | const LogoStyle = styled.img` 44 | height:30px; 45 | margin: auto 0px 0px 2px; 46 | ` 47 | 48 | interface ShowLoginProps { 49 | showLogin?: boolean 50 | //onLogin?: (user: IdToken) => void 51 | } 52 | 53 | interface HeaderBarProps extends ShowLoginProps { 54 | style?: React.CSSProperties 55 | } 56 | 57 | export const HomeLogo = () => 58 | 59 | export const TopSiteBar = ({ showLogin, style }: HeaderBarProps) => 60 | 61 | 62 | 63 | 64 | export const LinkA: FunctionComponent<{ to: string, className?: string }> = ({ to, children, className }) => { 65 | const active = safeLocation()?.pathname == to 66 | const classes = [className + ' menu-item', active ? 
'active' : null].filter(c => c).join(" ") 67 | 68 | // we use A instead of Link because Flows pages is super heavy and navigating causes large javascript slowdowns 69 | return {children} 70 | } 71 | 72 | export const SiteLinks = ({ showLogin }: ShowLoginProps) => { 73 | const userCtx = useContext(UserContext) 74 | const user = userCtx?.user 75 | 76 | return 77 |
78 | Flows 79 | Search 80 |
81 | {showLogin && } 82 |
83 | } -------------------------------------------------------------------------------- /Site/src/components/Spinner.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | import styled from 'styled-components' 3 | import { ytTheme } from './MainLayout' 4 | 5 | const SvgStyle = styled.svg` 6 | background: none; 7 | display: block; 8 | shape-rendering: auto; 9 | margin: 0 auto; 10 | ` 11 | 12 | export const Spinner = ({ size }: { size: string }) => 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Site/src/components/Tag.tsx: -------------------------------------------------------------------------------- 1 | 2 | import React, { useState, PropsWithChildren, MouseEventHandler } from "react" 3 | import styled from 'styled-components' 4 | 5 | const TagStyle = styled.span` 6 | display: inline-block; 7 | background-color: rgb(66, 66, 66); 8 | font-size: 0.9em; 9 | font-weight: bold; 10 | line-height: 1.6; 11 | border-radius: 5px; 12 | padding: 1px 6px; 13 | white-space:nowrap; 14 | ` 15 | 16 | export const Tag = ({ color, label, style }: { color?: string, label: string, style?: React.CSSProperties }) => 17 | {label} -------------------------------------------------------------------------------- /Site/src/components/UserMenu.tsx: -------------------------------------------------------------------------------- 1 | import React, { useContext, FunctionComponent, useState, useRef } from 'react' 2 | import styled from 'styled-components' 3 | import { ExitToApp as IconLogout } from '@styled-icons/material' 4 | import { UserContext } from './UserContext' 5 | import { ytTheme } from './MainLayout' 6 | import useOutsideClick from './OutsideClick' 7 | 8 | 9 | const ProfileIcon = styled.img` 10 | border-radius:50%; 11 | height:25px; 12 | :hover { 13 | cursor: pointer; 14 | } 15 | ` 16 | 17 | const MainDiv = styled.div` 18 | 
position:relative; 19 | padding:0 5px; 20 | ` 21 | 22 | const UserMenuDiv = styled.div` 23 | position:absolute; 24 | 25 | top: 27px; 26 | right:0px; 27 | z-index:1; 28 | background-color:${ytTheme.backColorBolder}; 29 | box-shadow: 0 0 10px rgba(0, 0, 0, 0.8); 30 | 31 | width:200px; 32 | max-width:90vw; 33 | 34 | div.profile { 35 | text-align:center; 36 | padding:1em; 37 | } 38 | 39 | li { 40 | list-style: none; 41 | 42 | padding:0.5em; 43 | 44 | a { 45 | color: ${ytTheme.fontColor}; 46 | } 47 | 48 | .icon { 49 | height: 1.2em; 50 | position: relative; 51 | top: -0.05em; 52 | padding-right: 0.5em; 53 | } 54 | 55 | :hover { 56 | background-color:${ytTheme.backColorBolder2}; 57 | cursor: pointer; 58 | } 59 | } 60 | ` 61 | 62 | export const UserMenu = () => { 63 | const ctx = useContext(UserContext) 64 | const user = ctx?.user 65 | const [isMenuOpen, setIsMenuOpen] = useState(false) 66 | const ref = useRef() 67 | 68 | useOutsideClick(ref, () => setIsMenuOpen(false)) 69 | 70 | if (!user) return ctx?.logIn()}>Sign In 71 | 72 | return 73 | { 74 | setIsMenuOpen(!isMenuOpen) 75 | }} /> 76 | 77 |
78 |
79 | {user?.name}
80 | {user?.email} 81 |
82 |
    83 |
  • ctx?.logOut()}> Sign Out
  • 84 |
85 |
86 |
87 | } 88 | 89 | -------------------------------------------------------------------------------- /Site/src/components/channel/Channel.tsx: -------------------------------------------------------------------------------- 1 | import React from 'react' 2 | 3 | export const channelUrl = (channelId: string) => `https://www.youtube.com/channel/${channelId}` 4 | 5 | export const ChannelLogo = ({ thumb, channelId, style }: { thumb: string, channelId: string, style?: React.CSSProperties }) => <> 6 | 7 | * { 27 | margin-right:0.3em; 28 | margin-bottom:0.2em; 29 | } 30 | ` 31 | 32 | export const ChannelTags = (props: ChannelComponentProps) => { 33 | const c = props.channel 34 | const tagCol = YtModel.channelDimStatic.col('tags') 35 | const tagLabel = ColEx.labelFunc(tagCol) 36 | const colorFunc = ColEx.colorFunc(tagCol) 37 | return 38 | {/* */} 39 | {c.tags.map(t => )} 40 | 41 | } 42 | 43 | interface ColTagProps { colName: keyof ChannelTagCols, channel: ChannelTagData, style: CSSProperties } 44 | const ColTag: FunctionComponent = p => { 45 | const c = p.channel 46 | const col = dim.col(p.colName) 47 | const labelFunc = ColEx.labelFunc(col) 48 | const colorFunc = ColEx.colorFunc(col) 49 | const val = c[p.colName] 50 | return 51 | } 52 | 53 | export const tagColor = (chartColor: string) => color(chartColor)?.darker(1).hex() -------------------------------------------------------------------------------- /Site/src/pages/404.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | import { MainLayout, TextPage } from '../components/MainLayout' 3 | 4 | 5 | const P404 = () => ( 6 | 7 | 8 |

Page not found

9 |
10 |
11 | ) 12 | 13 | export default P404 -------------------------------------------------------------------------------- /Site/src/pages/index.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | import { ChannelRelationsPage } from "../components/channel_relations/ChannelRelationsPage" 3 | import { MainLayout } from "../components/MainLayout" 4 | import { uri, Uri } from '../common/Uri' 5 | import { resultsUrl } from '../common/YtApi' 6 | 7 | 8 | const App = () => ( 9 | 10 | 11 | 12 | ) 13 | 14 | export default App -------------------------------------------------------------------------------- /Site/src/pages/review.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | import { MainLayout } from '../components/MainLayout' 3 | import { ReviewControl } from '../components/review/Review' 4 | import { TopSiteBar } from '../components/SiteMenu' 5 | 6 | const ReviewPage = () => ( 7 | 8 | 9 | 10 | 11 | ) 12 | export default ReviewPage 13 | 14 | 15 | -------------------------------------------------------------------------------- /Site/src/pages/search.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | import { VideoSearch } from '../components/search/VideoSearch' 3 | import { MainLayout } from '../components/MainLayout' 4 | import { TopSiteBar } from '../components/SiteMenu' 5 | import { esCfgFromEnv } from '../common/Elastic' 6 | 7 | const SearchPage = () => ( 8 | 9 | 10 | 11 | ) 12 | export default SearchPage 13 | 14 | 15 | -------------------------------------------------------------------------------- /Site/src/pages/video.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react" 2 | import { Router, RouteComponentProps as CP } from "@reach/router" 3 | import { Video } from 
"../components/search/VideoPage" 4 | import { VideoSearch } from '../components/search/VideoSearch' 5 | import { MainLayout } from '../components/MainLayout' 6 | import { TopSiteBar } from '../components/SiteMenu' 7 | import { esCfgFromEnv } from '../common/Elastic' 8 | 9 | const VideoPage = () => ( 10 | 11 | 12 | 15 | 16 | ) 17 | export default VideoPage 18 | 19 | const BlankPage = (props: CP<{}>) =>
20 | const VideoPageNotFound = (props: CP<{}>) =>

The requested video page couldn't be found

21 | -------------------------------------------------------------------------------- /Site/src/styles/main.css: -------------------------------------------------------------------------------- 1 | * { 2 | margin: 0; 3 | padding: 0; 4 | box-sizing: border-box; 5 | } 6 | 7 | html, 8 | body, 9 | input { 10 | font-size: 14px; 11 | } 12 | 13 | body, 14 | select, 15 | input { 16 | padding: 0; 17 | font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; 18 | background-color: black; 19 | } 20 | 21 | select { 22 | background-color: transparent; 23 | border: none; 24 | outline: none; 25 | } 26 | 27 | html, 28 | div, 29 | ul { 30 | margin: 0px; 31 | padding: 0px; 32 | } 33 | 34 | h1, 35 | h2, 36 | h3, 37 | h4, 38 | p { 39 | margin: 0px; 40 | padding: 0px; 41 | } 42 | 43 | h2 { 44 | font-size: 1.2em; 45 | } 46 | -------------------------------------------------------------------------------- /Site/src/types/NodeTypings.d.ts: -------------------------------------------------------------------------------- 1 | declare namespace NodeJS { 2 | 3 | // Merge the existing `ProcessEnv` definition with ours 4 | // https://www.typescriptlang.org/docs/handbook/declaration-merging.html#merging-interfaces 5 | export interface ProcessEnv { 6 | NODE_ENV: "development" | "production" 7 | RESULTS_HOST: string 8 | RESULTS_CONTAINER: string 9 | RESULTS_PATH: string 10 | FUNC_URL: string 11 | AUTH0_DOMAIN: string 12 | AUTH0_CLIENTID: string 13 | ES_URL: string 14 | BRANCH_ENV: string 15 | } 16 | } -------------------------------------------------------------------------------- /Site/src/types/custom.d.ts: -------------------------------------------------------------------------------- 1 | declare module "*.svg" { 2 | const content: string 3 | export default content 4 | } -------------------------------------------------------------------------------- /Site/src/types/d3fc-label-layout.d.ts: -------------------------------------------------------------------------------- 1 | 2 | declare 
module 'd3fc-label-layout' { 3 | 4 | export function layoutTextLabel(): LayoutTextLabel 5 | 6 | type SelectionCallable = (selection: d3.Selection, ...args: any[]) => void 7 | 8 | interface LayoutTextLabel { 9 | padding(padding: number): this 10 | value(): string 11 | value(value: ((d: any) => string)): this 12 | } 13 | 14 | interface LayoutStrategy { 15 | 16 | } 17 | 18 | interface LayoutLabel extends SelectionCallable { 19 | size(size: Array | ((data: Datum, index: number, group: Array) => Array)): this 20 | position(position: Array): this 21 | position(position: ((data: any) => Array)): this 22 | component(component: any): this 23 | } 24 | 25 | export function layoutRemoveOverlaps(strategy: LayoutStrategy): LayoutStrategy 26 | 27 | export function layoutGreedy(): LayoutStrategy 28 | 29 | export function layoutLabel(strategy: LayoutStrategy): LayoutLabel 30 | 31 | } -------------------------------------------------------------------------------- /Site/src/types/lucene-query-parser.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'lucene-query-parser' { 2 | export function parse(input: string, options?: any): LuceneTerm | LuceneLeftRight 3 | 4 | 5 | export interface LuceneLeftRight { 6 | left: LuceneTerm | LuceneLeftRight 7 | operator: string 8 | right: LuceneTerm | LuceneLeftRight 9 | } 10 | 11 | export interface LuceneTerm { 12 | field: string 13 | term: string 14 | similarity?: number 15 | boost?: number 16 | prefix?: string 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Site/src/types/react-select-async.d.ts: -------------------------------------------------------------------------------- 1 | // this is a workaround issue with v3 https://github.com/JedWatson/react-select/issues/3592 2 | declare module 'react-select/async' { 3 | import Async from 'react-select/lib/Async' 4 | export * from 'react-select/lib/Async' 5 | export default Async 6 | } 7 | 8 | 
declare module 'react-select/async-creatable' { 9 | import AsyncCreatable from 'react-select/lib/AsyncCreatable' 10 | export * from 'react-select/lib/AsyncCreatable' 11 | export default AsyncCreatable 12 | } -------------------------------------------------------------------------------- /Site/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Site/static/favicon.ico -------------------------------------------------------------------------------- /Site/static/help/categories_flow_help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Site/static/help/categories_flow_help.png -------------------------------------------------------------------------------- /Site/static/help/ideology_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Site/static/help/ideology_selection.png -------------------------------------------------------------------------------- /Site/static/help/selection_flow_help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Site/static/help/selection_flow_help.png -------------------------------------------------------------------------------- /Site/static/spinner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markledwich2/Recfluence/d64c2790ebba8389a017b63fcdd139eba2a7a07b/Site/static/spinner.png -------------------------------------------------------------------------------- /Site/tsconfig.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "outDir": "./dist/", 4 | "sourceMap": true, 5 | "noImplicitAny": true, 6 | "module": "commonjs", 7 | "target": "es6", 8 | "jsx": "react", 9 | "allowSyntheticDefaultImports": true, 10 | "keyofStringsOnly": true 11 | }, 12 | "include": [ 13 | "./src/**/*", 14 | ], 15 | "typeRoots": [ 16 | "./src/types", 17 | "node_modules/@types" 18 | ], 19 | "types": [ 20 | "youtube" 21 | ] 22 | } -------------------------------------------------------------------------------- /UserScrape/.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 4 | #------------------------------------------------------------------------------------------------------------- 5 | 6 | # Update the VARIANT arg in devcontainer.json to pick a Python version: 3, 3.8, 3.7, 3.6 7 | # To fully customize the contents of this image, use the following Dockerfile instead: 8 | # https://github.com/microsoft/vscode-dev-containers/tree/v0.112.0/containers/python-3/.devcontainer/base.Dockerfile 9 | ARG VARIANT="3.8" 10 | FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | RUN apt-get update \ 14 | && apt-get -y install --no-install-recommends \ 15 | chromium-driver curl unzip wget bzip2 16 | 17 | 18 | RUN GECKODRIVER_VERSION=`curl https://github.com/mozilla/geckodriver/releases/latest | grep -Po 'v[0-9]+.[0-9]+.[0-9]+'` && \ 19 | wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz && \ 20 | tar -zxf 
geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz -C /usr/local/bin && \ 21 | chmod +x /usr/local/bin/geckodriver && \ 22 | rm geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz 23 | 24 | RUN FIREFOX_SETUP=firefox-setup.tar.bz2 && \ 25 | apt-get purge firefox && \ 26 | wget -O $FIREFOX_SETUP "https://download.mozilla.org/?product=firefox-latest&os=linux64" && \ 27 | tar xjf $FIREFOX_SETUP -C /opt/ && \ 28 | ln -s /opt/firefox/firefox /usr/bin/firefox && \ 29 | rm $FIREFOX_SETUP && \ 30 | apt-get -y install libdbus-glib-1-2 31 | 32 | # Clean up 33 | RUN apt-get autoremove -y \ 34 | && apt-get clean -y \ 35 | && rm -rf /var/lib/apt/lists/* 36 | 37 | ENV DEBIAN_FRONTEND=dialog 38 | -------------------------------------------------------------------------------- /UserScrape/.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.112.0/containers/python-3 3 | { 4 | "name": "Python 3", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "context": "..", 8 | // Update 'VARIANT' to pick a Python version. Rebuild the container 9 | // if it already exists to update. Available variants: 3, 3.6, 3.7, 3.8 10 | "args": { 11 | "VARIANT": "3" 12 | } 13 | }, 14 | // Set *default* container specific settings.json values on container create. 
15 | "settings": { 16 | "terminal.integrated.shell.linux": "/bin/bash", 17 | "python.pythonPath": "/usr/local/bin/python", 18 | "python.linting.enabled": true, 19 | "python.linting.pylintEnabled": true, 20 | "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", 21 | "python.formatting.blackPath": "/usr/local/py-utils/bin/black", 22 | "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", 23 | "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", 24 | "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", 25 | "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", 26 | "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", 27 | "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", 28 | "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint", 29 | "python.testing.pytestPath": "/usr/local/py-utils/bin/pytest" 30 | }, 31 | // Add the IDs of extensions you want installed when the container is created. 32 | "extensions": [ 33 | "ms-python.python", 34 | "Azurite.azurite", 35 | "ms-azuretools.vscode-azurestorage", 36 | "formulahendry.code-runner", 37 | "njpwerner.autodocstring", 38 | "ms-python.vscode-pylance" 39 | ], 40 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 41 | // "forwardPorts": [], 42 | // Use 'postCreateCommand' to run commands after the container is created. 43 | "postCreateCommand": "pip3 install --user -r requirements.txt", 44 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 
45 | // "remoteUser": "vscode" 46 | } -------------------------------------------------------------------------------- /UserScrape/.dockerignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/.classpath 3 | **/.dockerignore 4 | **/.env 5 | **/.git 6 | **/.gitignore 7 | **/.project 8 | **/.settings 9 | **/.toolstarget 10 | **/.vs 11 | **/.vscode 12 | **/*.*proj.user 13 | **/*.dbmdl 14 | **/*.jfm 15 | **/azds.yaml 16 | **/bin 17 | **/charts 18 | **/docker-compose* 19 | **/Dockerfile* 20 | **/node_modules 21 | **/npm-debug.log 22 | **/obj 23 | **/secrets.dev.yaml 24 | **/values.dev.yaml 25 | README.md -------------------------------------------------------------------------------- /UserScrape/.gitignore: -------------------------------------------------------------------------------- 1 | userscrape.json 2 | __azurite_db_blob*.json 3 | __*__ -------------------------------------------------------------------------------- /UserScrape/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations. 3 | // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp 4 | // List of extensions which should be recommended for users of this workspace. 5 | "recommendations": [ 6 | "Azurite.azurite", 7 | "ms-azuretools.vscode-azurestorage", 8 | "ms-python.python", 9 | "njpwerner.autodocstring" 10 | ], 11 | // List of extensions recommended by VS Code that should not be recommended for users of this workspace. 
12 | "unwantedRecommendations": [] 13 | } -------------------------------------------------------------------------------- /UserScrape/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "start experiment (trial)", 5 | "args": [ 6 | "-t", 7 | "${input:pickTrial}" 8 | ], 9 | "type": "python", 10 | "request": "launch", 11 | "program": "app.py", 12 | "console": "integratedTerminal", 13 | }, 14 | { 15 | "name": "start experiment (accounts)", 16 | "args": [ 17 | "-a", 18 | "${input:pickAccounts}" 19 | ], 20 | "type": "python", 21 | "request": "launch", 22 | "program": "app.py", 23 | "console": "integratedTerminal", 24 | }, 25 | { 26 | "name": "start experiment", 27 | "type": "python", 28 | "request": "launch", 29 | "program": "app.py", 30 | "console": "integratedTerminal", 31 | }, 32 | { 33 | "name": "Python: current file", 34 | "type": "python", 35 | "request": "launch", 36 | "program": "${file}", 37 | "console": "integratedTerminal" 38 | } 39 | ], 40 | "inputs": [ 41 | { 42 | "id": "pickTrial", 43 | "description": "Enter a trail ID (blank for none)", 44 | "type": "promptString" 45 | }, 46 | { 47 | "id": "pickAccounts", 48 | "description": "Enter | seperated account names (e.g. 
MRA, AntiSJW)", 49 | "type": "promptString" 50 | } 51 | ] 52 | } -------------------------------------------------------------------------------- /UserScrape/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "Identitarian", 4 | "aiohttp", 5 | "appcfg", 6 | "asdict", 7 | "asyncio", 8 | "creds", 9 | "dotenv", 10 | "getenv", 11 | "groupby", 12 | "iterrows", 13 | "jdbc", 14 | "jsonl", 15 | "jsons", 16 | "nargs", 17 | "sesh", 18 | "spawnl", 19 | "strftime", 20 | "userscrape", 21 | "vids", 22 | "visibles" 23 | ], 24 | "python.linting.pylintEnabled": true, 25 | "python.linting.enabled": true, 26 | "json.schemas": [ 27 | { 28 | "fileMatch": [ 29 | "userscrape.json" 30 | ], 31 | "url": "/userscrape.schema.json" 32 | } 33 | ], 34 | "files.exclude": { 35 | "**/.git": true, 36 | "**/.svn": true, 37 | "**/.hg": true, 38 | "**/CVS": true, 39 | "**/.DS_Store": true, 40 | "**/__*__": true, 41 | "__azurite_*.json": true, 42 | }, 43 | "editor.formatOnSave": true 44 | } -------------------------------------------------------------------------------- /UserScrape/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | { 4 | "type": "docker-build", 5 | "label": "docker-build", 6 | "platform": "python", 7 | "dockerBuild": { 8 | "tag": "ytnetworks.azurecr.io/userscrape:latest", 9 | "dockerfile": "${workspaceFolder}/Dockerfile", 10 | "context": "${workspaceFolder}" 11 | }, 12 | "group": { 13 | "kind": "build", 14 | "isDefault": true 15 | } 16 | }, 17 | { 18 | "label": "run docker -it", 19 | "type": "shell", 20 | "command": "docker", 21 | "args": [ 22 | "run", 23 | "-it", 24 | "--env-file", 25 | ".env", 26 | "ytnetworks.azurecr.io/userscrape:latest" 27 | ], 28 | "problemMatcher": [] 29 | } 30 | ], 31 | } -------------------------------------------------------------------------------- /UserScrape/Dockerfile: 
-------------------------------------------------------------------------------- 1 | # For more information, please refer to https://aka.ms/vscode-docker-python 2 | FROM python:3.8-slim 3 | 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | 6 | RUN apt-get update \ 7 | && apt-get -y install --no-install-recommends \ 8 | curl unzip wget bzip2 chromium-driver 9 | 10 | RUN GECKODRIVER_VERSION=`curl https://github.com/mozilla/geckodriver/releases/latest | grep -Po 'v[0-9]+.[0-9]+.[0-9]+'` && \ 11 | wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz && \ 12 | tar -zxf geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz -C /usr/local/bin && \ 13 | chmod +x /usr/local/bin/geckodriver && \ 14 | rm geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz 15 | 16 | RUN FIREFOX_SETUP=firefox-setup.tar.bz2 && \ 17 | apt-get purge firefox && \ 18 | wget -O $FIREFOX_SETUP "https://download.mozilla.org/?product=firefox-latest&os=linux64" && \ 19 | tar xjf $FIREFOX_SETUP -C /opt/ && \ 20 | ln -s /opt/firefox/firefox /usr/bin/firefox && \ 21 | rm $FIREFOX_SETUP && \ 22 | apt-get -y install libdbus-glib-1-2 23 | 24 | # Clean up 25 | RUN apt-get autoremove -y \ 26 | && apt-get clean -y \ 27 | && apt-get remove curl unzip wget bzip2 -y \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | ENV DEBIAN_FRONTEND=dialog 31 | 32 | # upgrade pip 33 | RUN pip install --upgrade pip 34 | 35 | # Keeps Python from generating .pyc files in the container 36 | ENV PYTHONDONTWRITEBYTECODE 1 37 | 38 | # Turns off buffering for easier container logging 39 | ENV PYTHONUNBUFFERED 1 40 | 41 | # Install pip requirements 42 | ADD requirements.txt . 43 | RUN python -m pip install -r requirements.txt 44 | 45 | WORKDIR /app 46 | ADD . /app 47 | 48 | # During debugging, this entry point will be overridden. 
For more information, refer to https://aka.ms/vscode-docker-python-debug 49 | CMD ["python", "app.py"] -------------------------------------------------------------------------------- /UserScrape/cfg_generalte_schema.py: -------------------------------------------------------------------------------- 1 | import json 2 | from userscrape.cfg import Cfg 3 | 4 | with open('userscrape.schema.json', "w") as w: 5 | jSchema = Cfg.json_schema() 6 | schemaTxt = json.dumps(jSchema, indent=' ') 7 | w.write(schemaTxt) 8 | -------------------------------------------------------------------------------- /UserScrape/requirements.txt: -------------------------------------------------------------------------------- 1 | selenium 2 | azure-storage-blob 3 | python-dotenv 4 | dataclasses 5 | dataclasses_json 6 | dataclasses_jsonschema 7 | pandas 8 | discord.py 9 | seqlog 10 | shortuuid 11 | aiohttp 12 | more-itertools -------------------------------------------------------------------------------- /UserScrape/sandbox.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from typing import List 4 | from userscrape.cfg import load_cfg, UserCfg, Cfg 5 | from userscrape.crawler import Crawler 6 | import asyncio 7 | from userscrape.store import BlobStore, new_trial_id 8 | from azure.storage.blob import PublicAccess 9 | import logging 10 | from userscrape.log import configure_log 11 | import os 12 | import sys 13 | 14 | 15 | async def setup_test_crawler() -> Crawler: 16 | cfg: Cfg = await load_cfg() 17 | store = BlobStore(cfg.store) 18 | store.ensure_container_exits(PublicAccess.Container) 19 | user = cfg.users[0] 20 | trial_id = new_trial_id() 21 | log = configure_log(cfg.seqUrl, os.getenv('env'), cfg.branch_env, trial_id) 22 | crawler = Crawler(store, None, user, cfg, trial_id, log) 23 | return crawler 24 | 25 | 26 | async def test_log(): 27 | cfg: Cfg = await load_cfg() 28 | log = configure_log(cfg.seqUrl, os.getenv('env'), cfg.branch_env, 
def format_seconds(seconds):
    """Format a duration given in seconds as a compact human-readable string.

    The value is truncated to whole seconds with ``int()`` and broken down into
    days, hours, minutes and seconds. Only the units from the largest non-zero
    one downwards are shown, e.g. ``'1d 0h 0m 5s'``, ``'2h 0m 1s'``, ``'3m 4s'``
    or ``'7s'``.

    Args:
        seconds: duration in seconds; anything accepted by ``int()``.

    Returns:
        The formatted duration. Negative durations are rendered as the
        formatted magnitude prefixed with ``'-'`` (e.g. ``'-1m 5s'``).
    """
    total = int(seconds)
    # divmod() floors toward negative infinity, so splitting a negative value
    # directly would yield nonsense such as '23h 59m 55s' for -5. Format the
    # magnitude instead and re-attach the sign.
    sign = '-' if total < 0 else ''
    days, remainder = divmod(abs(total), 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, secs = divmod(remainder, 60)
    if days > 0:
        return sign + '%dd %dh %dm %ds' % (days, hours, minutes, secs)
    elif hours > 0:
        return sign + '%dh %dm %ds' % (hours, minutes, secs)
    elif minutes > 0:
        return sign + '%dm %ds' % (minutes, secs)
    else:
        return sign + '%ds' % (secs,)
def configure_log(url: str, env='dev', branch_env=None, trial_id=None) -> logging.Logger:
    """Set up structured logging through seqlog and return the 'seq' logger.

    The root logger is configured at WARN with a console handler. The 'seq'
    logger is configured at DEBUG with a Seq handler pointed at ``url`` and
    propagates to the root logger. ``env``, ``branch_env`` and ``trial_id`` are
    registered, together with the fixed app name, as global log properties.

    Args:
        url: server URL handed to the Seq log handler.
        env: environment name recorded as a global log property.
        branch_env: branch environment recorded as a global log property.
        trial_id: trial identifier recorded as a global log property.

    Returns:
        The logger named 'seq'.
    """
    # both handlers share the '{'-style formatter registered under the name 'seq'
    formatters = {
        'seq': {
            'style': '{'
        }
    }

    console_handler = {
        'class': 'seqlog.structured_logging.ConsoleStructuredLogHandler',
        'formatter': 'seq'
    }

    seq_handler = {
        'class': 'seqlog.structured_logging.SeqLogHandler',
        'server_url': url,
        'batch_size': 10,
        'auto_flush_timeout': 2,
        'formatter': 'seq'
    }

    root_logger = {
        'level': 'WARN',
        'handlers': ['console']
    }

    seq_logger = {
        'level': 'DEBUG',
        'handlers': ['seq'],
        'propagate': True
    }

    log_cfg = {
        'version': 1,
        'disable_existing_loggers': True,
        'root': root_logger,
        'loggers': {'seq': seq_logger},
        'handlers': {'console': console_handler, 'seq': seq_handler},
        'formatters': formatters
    }
    seqlog.configure_from_dict(log_cfg)

    # properties attached globally by seqlog
    global_props = dict(app="UserScrape", env=env, branch_env=branch_env, trial_id=trial_id)
    seqlog.set_global_log_properties(**global_props)

    return logging.getLogger('seq')
def save_complete_trial(trial_id: str, store: BlobStore, log: Logger):
    """Finalise a trial by consolidating its per-item results into .jsonl files.

    Deletes the trial's incomplete marker, then, for each of the rec, feed, ad
    and watch-time source paths, loads every blob listed under that path and
    writes them as one JSON document per line into a single ``.jsonl`` file
    uploaded under the trial's results output path.

    Args:
        trial_id: identifier of the trial being completed.
        store: blob store used to list, load, upload and delete blobs.
        log: logger that receives one info event per consolidated file.
    """

    def save_complete_jsons(source: PurePath, dest: PurePath):
        """Merge all blobs under ``source`` into one .jsonl file saved under ``dest``."""
        source_files = store.list(source)
        # timestamp + short random suffix keeps file names distinct between runs
        file_name = f'{file_date_str()}.{shortuuid.random(4)}.jsonl'
        local_path = Path(tempfile.gettempdir(), 'UserScrape', file_name)
        local_path.parent.mkdir(parents=True, exist_ok=True)
        dest_file = dest / file_name
        try:
            with open(local_path, "w", encoding="utf-8") as w:
                for file in source_files:
                    d = store.load_dic(PurePath(file.name))
                    w.write(json.dumps(d, indent=None) + '\n')
            store.save_file(local_path, dest_file)
        finally:
            # always clean up the temp file, including when loading a blob or
            # the upload fails, so failed runs do not leave files behind
            if local_path.exists():
                os.remove(local_path)
        log.info('saved completed trial {trial_id} results to {dest_file}',
                 trial_id=trial_id, dest_file=dest_file.as_posix())

    path = BlobPaths(store.cfg, trial_id)

    # NOTE(review): the incomplete marker is removed before the results are
    # consolidated, so a failure below leaves no marker for this trial —
    # confirm this ordering is intended.
    store.delete(path.trial_incomplete_json())
    res_path = path.results_path_out()
    save_complete_jsons(path.rec_path(), res_path / 'rec')
    save_complete_jsons(path.feed_path(), res_path / 'feed')
    save_complete_jsons(path.ad_path(), res_path / 'ad')
    save_complete_jsons(path.watch_time_path(), res_path / 'watch')