From 7ad46f9d95976d7b5eba6f0ef73e99b26a1b1a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:36:28 +0100 Subject: [PATCH 01/22] feat: replication changes for the segmentRepositories table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../V1753798345__segmentRepositories_replication.sql | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 backend/src/database/migrations/V1753798345__segmentRepositories_replication.sql diff --git a/backend/src/database/migrations/V1753798345__segmentRepositories_replication.sql b/backend/src/database/migrations/V1753798345__segmentRepositories_replication.sql new file mode 100644 index 0000000000..98f940db95 --- /dev/null +++ b/backend/src/database/migrations/V1753798345__segmentRepositories_replication.sql @@ -0,0 +1,3 @@ +ALTER PUBLICATION sequin_pub ADD TABLE "segmentRepositories"; +ALTER TABLE "segmentRepositories" REPLICA IDENTITY FULL; +GRANT SELECT ON "segmentRepositories" to sequin; From 0fb2a8c63bcc9450e2f93e7abb5c1230efc4d666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:36:56 +0100 Subject: [PATCH 02/22] feat: create the segmentRepositories data source in TinyBird MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../segmentRepositories.datasource | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 services/libs/tinybird/datasources/segmentRepositories.datasource diff --git a/services/libs/tinybird/datasources/segmentRepositories.datasource b/services/libs/tinybird/datasources/segmentRepositories.datasource new file mode 100644 index 0000000000..f6e06540c1 --- /dev/null +++ b/services/libs/tinybird/datasources/segmentRepositories.datasource @@ -0,0 +1,26 @@ +DESCRIPTION > + - `segmentRepositories` contains the repository data associated with segments. + - Replicated from Postgres - it is meant to become the single source of truth about repositories in the future. + - Schema: + - `repository` is the URL for the repository. + - `segmentId` links to the segment the repository belongs to. + - `insightsProjectId` links to the insightsProject the repository belongs to. + - `createdAt` is a standard timestamp field to record lifecycle tracking. + - `archived` indicates whether the repository is archived (true/false). + - `excluded` indicates whether the repository is excluded from analytics and metrics (true/false). + - `last_archived_check` is the timestamp of the last check for whether the repository is archived. + +SCHEMA > + `repository` String `json:$.record.repository`, + `segmentId` UUID `json:$.record.segmentId`, + `insightsProjectId` UUID `json:$.record.insightsProjectId`, + `createdAt` DateTime64(3) `json:$.record.createdAt`, + `archived` Bool `json:$.record.archived`, + `excluded` Bool `json:$.record.excluded`, + `last_archived_check` Nullable(DateTime64(3)) `json:$.record.last_archived_check` + + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY repository +ENGINE_VER createdAt From f1ae4c60269ffedc4e4f75e3fbc269fc0ae876a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Tue, 12 Aug 2025 19:53:51 +0100 Subject: [PATCH 03/22] feat: add the archived and excluded columns to the projects in TinyBird MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../insights_projects_populated_ds.datasource | 4 ++++ .../pipes/insightsProjects_filtered.pipe | 2 ++ .../insights_projects_populated_copy.pipe | 22 ++++++++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource b/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource index 0143ff3fea..f5e8b4f3fb 100644 --- a/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource +++ b/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource @@ -9,6 +9,8 @@ DESCRIPTION > - `logoUrl`, `organizationId`, `website`, `github`, `linkedin`, `twitter` contain project branding and social links. - `widgets` array defines which insights widgets are enabled for this project. - `repositories` array contains the list of repository URLs associated with the project. + - `archivedRepositories` contains a list of archived repositories for the project. + - `excludedRepositories` contains a list of excluded repositories for the project. - `enabled` and `isLF` are flags for project status and Linux Foundation association (UInt8 boolean). - `keywords` array contains searchable keywords and tags for the project. - `collectionsSlugs` array contains slugs of collections this project belongs to. @@ -38,6 +40,8 @@ SCHEMA > `twitter` String, `widgets` Array(String), `repositories` Array(String), + `archivedRepositories` Array(String), + `excludedRepositories` Array(String), `enabled` UInt8, `isLF` UInt8, `keywords` Array(String), diff --git a/services/libs/tinybird/pipes/insightsProjects_filtered.pipe b/services/libs/tinybird/pipes/insightsProjects_filtered.pipe index 643bee5d13..f43cac97f1 100644 --- a/services/libs/tinybird/pipes/insightsProjects_filtered.pipe +++ b/services/libs/tinybird/pipes/insightsProjects_filtered.pipe @@ -14,6 +14,8 @@ SQL > insights_projects_populated_ds.description, insights_projects_populated_ds.logoUrl as logo, insights_projects_populated_ds.repositories, + insights_projects_populated_ds.archivedRepositories, + insights_projects_populated_ds.excludedRepositories, insights_projects_populated_ds.isLF, insights_projects_populated_ds.widgets, insights_projects_populated_ds.keywords, diff --git a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe index 4030ab797d..155f08b5c6 100644 --- a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe +++ b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe @@ -107,6 +107,21 @@ NODE insights_projects_populated_copy_health_score_deduplicated SQL > SELECT id, overallScore as healthScore FROM health_score_copy_ds +NODE archived_excluded_repositories +DESCRIPTION > + Get the archived and excluded repositories for each segment and insightsProject + +SQL > + SELECT + segmentId, + insightsProjectId, + groupArrayIf(repository, archived = true) AS "archivedRepositories", + groupArrayIf(repository, excluded = true) AS "excludedRepositories" + FROM segmentRepositories FINAL + WHERE + archived = true OR excluded = true + GROUP BY segmentId, insightsProjectId + NODE insights_projects_populated_copy_results DESCRIPTION > Join everything together @@ -140,7 +155,9 @@ SQL > insights_projects_populated_copy_aggregates.softwareValue as softwareValue, insights_projects_populated_copy_aggregates.contributorCount as contributorCount, insights_projects_populated_copy_aggregates.organizationCount as organizationCount, - insights_projects_populated_copy_health_score_deduplicated.healthScore as healthScore + insights_projects_populated_copy_health_score_deduplicated.healthScore as healthScore, + archived_excluded_repositories.archivedRepositories as archivedRepositories, + archived_excluded_repositories.excludedRepositories as excludedRepositories FROM insightsProjects FINAL LEFT JOIN insights_projects_populated_copy_collections_slugs @@ -160,6 +177,9 @@ SQL > LEFT JOIN insights_projects_populated_copy_health_score_deduplicated ON insights_projects_populated_copy_health_score_deduplicated.id = insightsProjects.id + LEFT JOIN + archived_excluded_repositories + ON archived_repositories.insightsProjectId = insightsProjects.id WHERE isNull (insightsProjects.deletedAt) TYPE COPY From bc183d94c2db0c077a3dd25f71bc1ee5d519cb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Tue, 12 Aug 2025 23:25:17 +0100 Subject: [PATCH 04/22] feat: add the archived project column to the search endpoint in TinyBird MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../libs/tinybird/pipes/search_collections_projects_repos.pipe | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/libs/tinybird/pipes/search_collections_projects_repos.pipe b/services/libs/tinybird/pipes/search_collections_projects_repos.pipe index d7693a90d8..19385eece1 100644 --- a/services/libs/tinybird/pipes/search_collections_projects_repos.pipe +++ b/services/libs/tinybird/pipes/search_collections_projects_repos.pipe @@ -30,7 +30,8 @@ SQL > insightsProjects_filtered.slug, insightsProjects_filtered.logo, insightsProjects_filtered.slug as "projectSlug", - insightsProjects_filtered.name + insightsProjects_filtered.name, + insightsProjects_filtered.archivedRepositories from insightsProjects_filtered where not ( From 307cd4560f2e7119ed274ffbcaa4ae3e47ea7bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Tue, 12 Aug 2025 23:33:47 +0100 Subject: [PATCH 05/22] chore: fix node name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../libs/tinybird/pipes/insights_projects_populated_copy.pipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe index 155f08b5c6..d65b464d9b 100644 --- a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe +++ b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe @@ -179,7 +179,7 @@ SQL > ON insights_projects_populated_copy_health_score_deduplicated.id = insightsProjects.id LEFT JOIN archived_excluded_repositories - ON archived_repositories.insightsProjectId = insightsProjects.id + ON archived_excluded_repositories.insightsProjectId = insightsProjects.id WHERE isNull (insightsProjects.deletedAt) TYPE COPY From e6b2939c13954d2b6710e1da9e9a415042528f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:28:29 +0100 Subject: [PATCH 06/22] chore: change archived and excluded flags in the search pipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../tinybird/pipes/search_collections_projects_repos.pipe | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/services/libs/tinybird/pipes/search_collections_projects_repos.pipe b/services/libs/tinybird/pipes/search_collections_projects_repos.pipe index 19385eece1..c3498b8f22 100644 --- a/services/libs/tinybird/pipes/search_collections_projects_repos.pipe +++ b/services/libs/tinybird/pipes/search_collections_projects_repos.pipe @@ -30,8 +30,7 @@ SQL > insightsProjects_filtered.slug, insightsProjects_filtered.logo, insightsProjects_filtered.slug as "projectSlug", - insightsProjects_filtered.name, - insightsProjects_filtered.archivedRepositories + insightsProjects_filtered.name from insightsProjects_filtered where not ( @@ -46,7 +45,10 @@ SQL > activityRepositories_filtered.repo as slug, null as logo, activityRepositories_filtered.projectSlug as "projectSlug", - null as name + null as name, + sr.archived as archived, + sr.excluded as excluded from activityRepositories_filtered + join segmentRepositories as sr on sr.insightsProjectId = activityRepositories_filtered.projectId order by activityRepositories_filtered.repo asc limit {{ Integer(limit, 10, description="Limit number of records for each type", required=False) }} From 3666ea4000b59a81380d1176296e5513aab3a5bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Sat, 16 Aug 2025 12:29:51 +0100 Subject: [PATCH 07/22] chore: add updatedAt column to segmentRepositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../U1755343323__add_updatedAt_to_segmentRepositories.sql | 2 ++ .../V1755343323__add_updatedAt_to_segmentRepositories.sql | 2 ++ .../libs/tinybird/datasources/segmentRepositories.datasource | 4 +--- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql create mode 100644 backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql diff --git a/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql b/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql new file mode 100644 index 0000000000..55fbcf4187 --- /dev/null +++ b/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql @@ -0,0 +1,2 @@ +ALTER TABLE segmentRepositories +DROP COLUMN updatedAt; diff --git a/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql b/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql new file mode 100644 index 0000000000..7c69d18990 --- /dev/null +++ b/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql @@ -0,0 +1,2 @@ +ALTER TABLE segmentRepositories +ADD COLUMN updatedAt TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP; \ No newline at end of file diff --git a/services/libs/tinybird/datasources/segmentRepositories.datasource b/services/libs/tinybird/datasources/segmentRepositories.datasource index f6e06540c1..417dccc0af 100644 --- a/services/libs/tinybird/datasources/segmentRepositories.datasource +++ b/services/libs/tinybird/datasources/segmentRepositories.datasource @@ -6,9 +6,7 @@ DESCRIPTION > - `segmentId` links to the segment the repository belongs to. - `insightsProjectId` links to the insightsProject the repository belongs to. - `createdAt` is a standard timestamp field to record lifecycle tracking. - - `archived` indicates whether the repository is archived (true/false). - - `excluded` indicates whether the repository is excluded from analytics and metrics (true/false). - - `last_archived_check` is the timestamp of the last check for whether the repository is archived. + - `updatedAt` is a standard timestamp field to record lifecycle tracking. SCHEMA > `repository` String `json:$.record.repository`, From f6e780ecd2b30d8e5bdab3b0716a35dbe407baec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Sat, 16 Aug 2025 12:31:00 +0100 Subject: [PATCH 08/22] chore: clarification about the purpose of the archived and excluded columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../datasources/insights_projects_populated_ds.datasource | 4 ++-- .../libs/tinybird/datasources/segmentRepositories.datasource | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource b/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource index f5e8b4f3fb..8fe3823047 100644 --- a/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource +++ b/services/libs/tinybird/datasources/insights_projects_populated_ds.datasource @@ -9,8 +9,8 @@ DESCRIPTION > - `logoUrl`, `organizationId`, `website`, `github`, `linkedin`, `twitter` contain project branding and social links. - `widgets` array defines which insights widgets are enabled for this project. - `repositories` array contains the list of repository URLs associated with the project. - - `archivedRepositories` contains a list of archived repositories for the project. - - `excludedRepositories` contains a list of excluded repositories for the project. + - `archivedRepositories` a list of archived repositories for the project, which is used to show archived repos in the frontend. + - `excludedRepositories` a list of excluded repositories for the project; excluded repos are not used in health score and security metrics calculations. - `enabled` and `isLF` are flags for project status and Linux Foundation association (UInt8 boolean). - `keywords` array contains searchable keywords and tags for the project. - `collectionsSlugs` array contains slugs of collections this project belongs to. diff --git a/services/libs/tinybird/datasources/segmentRepositories.datasource b/services/libs/tinybird/datasources/segmentRepositories.datasource index 417dccc0af..b77e2e6df8 100644 --- a/services/libs/tinybird/datasources/segmentRepositories.datasource +++ b/services/libs/tinybird/datasources/segmentRepositories.datasource @@ -5,6 +5,9 @@ DESCRIPTION > - `repository` is the URL for the repository. - `segmentId` links to the segment the repository belongs to. - `insightsProjectId` links to the insightsProject the repository belongs to. + - `archived` indicates whether the repository is archived (true/false). + - `excluded` indicates whether the repository is excluded from analytics and metrics (true/false). + - `last_archived_check` is the timestamp of the last check for whether the repository is archived. - `createdAt` is a standard timestamp field to record lifecycle tracking. - `updatedAt` is a standard timestamp field to record lifecycle tracking. From 3b1bdbfca450526bc259a01fadd358172ec92d83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Sat, 16 Aug 2025 12:42:35 +0100 Subject: [PATCH 09/22] chore: add updatedAt column to segmentRepositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../U1755343323__add_updatedAt_to_segmentRepositories.sql | 4 ++-- .../V1755343323__add_updatedAt_to_segmentRepositories.sql | 4 ++-- .../libs/tinybird/datasources/segmentRepositories.datasource | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql b/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql index 55fbcf4187..0c395d153f 100644 --- a/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql +++ b/backend/src/database/migrations/U1755343323__add_updatedAt_to_segmentRepositories.sql @@ -1,2 +1,2 @@ -ALTER TABLE segmentRepositories -DROP COLUMN updatedAt; +ALTER TABLE "segmentRepositories" +DROP COLUMN updated_at; diff --git a/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql b/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql index 7c69d18990..82dc2b06f1 100644 --- a/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql +++ b/backend/src/database/migrations/V1755343323__add_updatedAt_to_segmentRepositories.sql @@ -1,2 +1,2 @@ -ALTER TABLE segmentRepositories -ADD COLUMN updatedAt TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP; \ No newline at end of file +ALTER TABLE "segmentRepositories" +ADD COLUMN updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP; \ No newline at end of file diff --git a/services/libs/tinybird/datasources/segmentRepositories.datasource b/services/libs/tinybird/datasources/segmentRepositories.datasource index b77e2e6df8..f454713b5c 100644 --- a/services/libs/tinybird/datasources/segmentRepositories.datasource +++ b/services/libs/tinybird/datasources/segmentRepositories.datasource @@ -15,10 +15,11 @@ SCHEMA > `repository` String `json:$.record.repository`, `segmentId` UUID `json:$.record.segmentId`, `insightsProjectId` UUID `json:$.record.insightsProjectId`, - `createdAt` DateTime64(3) `json:$.record.createdAt`, `archived` Bool `json:$.record.archived`, `excluded` Bool `json:$.record.excluded`, - `last_archived_check` Nullable(DateTime64(3)) `json:$.record.last_archived_check` + `last_archived_check` Nullable(DateTime64(3)) `json:$.record.last_archived_check`, + `createdAt` DateTime64(3) `json:$.record.createdAt`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree From f7f5eb74d1ff9bcf77d9942710f737797f3d70d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Thu, 28 Aug 2025 16:48:36 +0100 Subject: [PATCH 10/22] chore: fix regression in column name after rebase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- services/cronjobs/archived_repositories/src/database.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/cronjobs/archived_repositories/src/database.ts b/services/cronjobs/archived_repositories/src/database.ts index 89e5c124e2..1a892ef207 100644 --- a/services/cronjobs/archived_repositories/src/database.ts +++ b/services/cronjobs/archived_repositories/src/database.ts @@ -62,7 +62,7 @@ export async function updateRepositoryStatus( try { await client.query( `UPDATE "segmentRepositories" - SET archived = $1, excluded = $2, last_archived_check = NOW(), updatedAt = NOW() + SET archived = $1, excluded = $2, last_archived_check = NOW(), updated_at = NOW() WHERE repository = $3`, [isArchived, isExcluded, repository] ); From 5c3d58588b0e46ccd4025cfd1c7bae751d72b550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Mon, 1 Sep 2025 16:19:03 +0100 Subject: [PATCH 11/22] fix: change segmentRepositories.datasource ENGINE_VER to updatedAt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../libs/tinybird/datasources/segmentRepositories.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/segmentRepositories.datasource b/services/libs/tinybird/datasources/segmentRepositories.datasource index f454713b5c..36b0db4584 100644 --- a/services/libs/tinybird/datasources/segmentRepositories.datasource +++ b/services/libs/tinybird/datasources/segmentRepositories.datasource @@ -25,4 +25,4 @@ SCHEMA > ENGINE ReplacingMergeTree ENGINE_PARTITION_KEY toYear(createdAt) ENGINE_SORTING_KEY repository -ENGINE_VER createdAt +ENGINE_VER updatedAt From 2ff7ee8d206a50c70aa0a3274a6c1811427f842f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:18:15 +0100 Subject: [PATCH 12/22] chore: update pnpm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- services/cronjobs/archived_repositories/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/cronjobs/archived_repositories/Dockerfile b/services/cronjobs/archived_repositories/Dockerfile index 349c95bb47..88fd7b7651 100644 --- a/services/cronjobs/archived_repositories/Dockerfile +++ b/services/cronjobs/archived_repositories/Dockerfile @@ -1,5 +1,5 @@ ARG NODE_VERSION=24 -ARG PNPM_VERSION=10.14.0 +ARG PNPM_VERSION=10.15.0 ARG PNPM_CACHE_DIR=/tmp/archived-repositories-cronjob-pnpm-cache ## Builder stage From 428ef4505d73ef62480aa0d4872d2d6c03482877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:18:44 +0100 Subject: [PATCH 13/22] fix: fix parsing some Gitlab repository URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../src/clients/github.ts | 16 +++++++++-- .../src/clients/gitlab.ts | 13 ++++++--- .../archived_repositories/src/main.ts | 22 ++++++++++----- .../archived_repositories/src/types.ts | 6 ----- .../archived_repositories/src/utils.ts | 27 ------------------- .../archived_repositories/src/workers.ts | 15 +++++------ 6 files changed, 46 insertions(+), 53 deletions(-) delete mode 100644 services/cronjobs/archived_repositories/src/utils.ts diff --git a/services/cronjobs/archived_repositories/src/clients/github.ts b/services/cronjobs/archived_repositories/src/clients/github.ts index 2478aef651..626d1f545a 100644 --- a/services/cronjobs/archived_repositories/src/clients/github.ts +++ b/services/cronjobs/archived_repositories/src/clients/github.ts @@ -1,9 +1,21 @@ import { ofetch } from 'ofetch'; import { Config } from "../config"; -export async function isGitHubRepoArchived(owner: string, repo: string, config: Config): Promise { +export async function isGitHubRepoArchived(url: string, config: Config): Promise { + const parsed = new URL(url); + const parts = parsed.pathname.split('/').filter(Boolean); + + if (parts.length < 2) { + throw new Error(`Invalid GitHub repository URL: ${url}`); + } + + const [owner, repo] = parts; + const data = await ofetch(`https://api.github.com/repos/${owner}/${repo}`, { - headers: { Authorization: `Bearer ${config.GithubToken}` }, + headers: { + Authorization: `Bearer ${config.GithubToken}`, + Accept: 'application/vnd.github+json', + }, }); return data.archived; diff --git a/services/cronjobs/archived_repositories/src/clients/gitlab.ts b/services/cronjobs/archived_repositories/src/clients/gitlab.ts index 077251dd80..d71fea8aef 100644 --- a/services/cronjobs/archived_repositories/src/clients/gitlab.ts +++ b/services/cronjobs/archived_repositories/src/clients/gitlab.ts @@ -1,9 +1,16 @@ import { ofetch } from 'ofetch'; import { Config } from "../config"; -export async function isGitLabRepoArchived(owner: string, repo: string, config: Config): Promise { - const projectPath = encodeURIComponent(`${owner}/${repo}`); - const data = await ofetch(`https://gitlab.com/api/v4/projects/${projectPath}`, { +export async function isGitLabRepoArchived(url: string, config: Config): Promise { + const parsed = new URL(url); + const projectPath = parsed.pathname.split('/').filter(Boolean).join('/'); + + if (!projectPath) { + throw new Error(`Invalid GitLab repository URL: ${url}`); + } + + const encodedProjectPath = encodeURIComponent(projectPath); + const data = await ofetch(`https://gitlab.com/api/v4/projects/${encodedProjectPath}`, { headers: { 'PRIVATE-TOKEN': config.GitlabToken }, }); diff --git a/services/cronjobs/archived_repositories/src/main.ts b/services/cronjobs/archived_repositories/src/main.ts index dbe3d1d9f8..c4433a5472 100644 --- a/services/cronjobs/archived_repositories/src/main.ts +++ b/services/cronjobs/archived_repositories/src/main.ts @@ -1,9 +1,8 @@ import { Queue } from 'bullmq'; import { getConfig, Config } from './config.js'; import { closeConnection, fetchRepositoryUrls } from './database.js'; -import { parseRepoURL } from './utils'; import { GITHUB_QUEUE_NAME, GITLAB_QUEUE_NAME } from './types'; -import { JobData, ParsedRepoInfo, Platform } from './types.js'; +import { JobData, Platform } from './types.js'; function sleep(ms: number): Promise { return new Promise(resolve => setTimeout(resolve, ms)); @@ -85,20 +84,29 @@ function prepareJobsByPlatform(repoURLs: string[]): { githubJobs: JobData[]; git const githubJobs: JobData[] = []; const gitlabJobs: JobData[] = []; - let parsedResult: ParsedRepoInfo; repoURLs.forEach((url) => { + let platform: Platform; + try { - parsedResult = parseRepoURL(url); + const parsed = new URL(url); + + if (parsed.hostname === 'github.com') { + platform = Platform.GITHUB; + } else if (parsed.hostname === 'gitlab.com') { + platform = Platform.GITLAB; + } else { + throw new Error(`Unsupported platform for URL: ${url}`); + } } catch (error) { console.warn(`Skipping URL due to error: ${error}`); return; } const jobData = { - name: `${parsedResult.platform}-repo-${parsedResult.owner}-${parsedResult.repo}`, + name: `${platform}-repo-${url.replace(/[^a-zA-Z0-9]/g, '-')}`, data: { url, - platform: parsedResult.platform, + platform, }, opts: { removeOnComplete: 1000, @@ -106,7 +114,7 @@ function prepareJobsByPlatform(repoURLs: string[]): { githubJobs: JobData[]; git } }; - switch (parsedResult.platform) { + switch (platform) { case Platform.GITHUB: githubJobs.push(jobData); break; diff --git a/services/cronjobs/archived_repositories/src/types.ts b/services/cronjobs/archived_repositories/src/types.ts index 48cee9f3d9..1f941990c3 100644 --- a/services/cronjobs/archived_repositories/src/types.ts +++ b/services/cronjobs/archived_repositories/src/types.ts @@ -6,12 +6,6 @@ export enum Platform { GITLAB = 'gitlab', } -export interface ParsedRepoInfo { - platform: Platform.GITHUB | Platform.GITLAB; - owner: string; - repo: string; -} - export interface JobData { name: string; data: { diff --git a/services/cronjobs/archived_repositories/src/utils.ts b/services/cronjobs/archived_repositories/src/utils.ts deleted file mode 100644 index fbf29ce069..0000000000 --- a/services/cronjobs/archived_repositories/src/utils.ts +++ /dev/null @@ -1,27 +0,0 @@ -import { Platform } from "./types"; -import type { ParsedRepoInfo } from "./types"; - -/* - * Parses a repository URL and returns the platform, owner, and repo name. - * This expects URLs in the format that are typical for GitHub and GitLab, - * e.g.: https://github.com/linuxfoundation/insights - * If in the future we need to support more platforms, we might need to revisit this. - */ -export function parseRepoURL(url: string): ParsedRepoInfo { - const parsed = new URL(url); - const parts = parsed.pathname.split('/').filter(Boolean); - - if (parts.length < 2) { - throw new Error(`Invalid repository URL: ${url}`); - } - - const [owner, repo] = parts; - - if (parsed.hostname === 'github.com') { - return { platform: Platform.GITHUB, owner, repo }; - } else if (parsed.hostname === 'gitlab.com') { - return { platform: Platform.GITLAB, owner, repo }; - } - - throw new Error(`Unsupported platform for URL: ${url}`); -} diff --git a/services/cronjobs/archived_repositories/src/workers.ts b/services/cronjobs/archived_repositories/src/workers.ts index ea3c3449d8..c651d9c686 100644 --- a/services/cronjobs/archived_repositories/src/workers.ts +++ b/services/cronjobs/archived_repositories/src/workers.ts @@ -2,7 +2,6 @@ import { Worker, Job, WorkerOptions } from 'bullmq'; import { getConfig } from './config.js'; import { isGitHubRepoArchived } from './clients/github'; import { isGitLabRepoArchived } from './clients/gitlab'; -import { parseRepoURL } from "./utils"; import { GITHUB_QUEUE_NAME, GITLAB_QUEUE_NAME, Platform } from './types'; import { updateRepositoryStatus } from "./database"; @@ -13,24 +12,24 @@ async function handleJob(job: Job) { throw new Error('Job data must contain a valid URL'); } - const parseResult = parseRepoURL(job.data.url); - let archived, excluded; switch (job.data.platform) { case Platform.GITHUB: - console.log(`Processing GitHub repo: ${parseResult.owner}/${parseResult.repo}`); - archived = excluded = await isGitHubRepoArchived(parseResult.owner, parseResult.repo, config); + console.log(`Processing GitHub repo: ${job.data.url}`); + archived = excluded = await isGitHubRepoArchived(job.data.url, config); // .github repositories should always be excluded from calculations, regardless of whether they are archived. - if (parseResult.repo === '.github') { + const parsed = new URL(job.data.url); + const parts = parsed.pathname.split('/').filter(Boolean); + if (parts.length >= 2 && parts[1] === '.github') { console.log(`Skipping .github repository: ${job.data.url}`); excluded = true; } break; case Platform.GITLAB: - console.log(`Processing GitLab repo: ${parseResult.owner}/${parseResult.repo}`); - archived = excluded = await isGitLabRepoArchived(parseResult.owner, parseResult.repo, config); + console.log(`Processing GitLab repo: ${job.data.url}`); + archived = excluded = await isGitLabRepoArchived(job.data.url, config); break; default: throw new Error(`Unsupported platform: ${job.data.platform}`); From d5f6fd15e8d91978497f2fcd3b61ed84011e8a85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Fri, 29 Aug 2025 23:58:52 +0100 Subject: [PATCH 14/22] fix: fix offset when fetching repositories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../cronjobs/archived_repositories/src/database.ts | 13 +++++++++---- .../cronjobs/archived_repositories/src/main.ts | 4 +++- .../cronjobs/archived_repositories/src/workers.ts | 14 +++++++++----- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/services/cronjobs/archived_repositories/src/database.ts b/services/cronjobs/archived_repositories/src/database.ts index 1a892ef207..21a984db0a 100644 --- a/services/cronjobs/archived_repositories/src/database.ts +++ b/services/cronjobs/archived_repositories/src/database.ts @@ -27,21 +27,26 @@ function getPool(config: Config): Pool { return pool; } -export async function fetchRepositoryUrls(batchSize: number, config: Config): Promise { +export async function fetchRepositoryUrls(batchSize: number, offset: number, config: Config): Promise { // Ensure that batchSize is a positive integer if (batchSize <= 0) { throw new Error('Invalid batch size. Please provide a positive integer.'); } + // Ensure that offset is a non-negative integer + if (offset < 0) { + throw new Error('Invalid offset. Please provide a non-negative integer.'); + } + const client = getPool(config); try { const result = await client.query( `SELECT repository FROM "segmentRepositories" - WHERE last_archived_check IS NULL OR last_archived_check < NOW() - INTERVAL \'5 days\' + WHERE (last_archived_check IS NULL OR last_archived_check < NOW() - INTERVAL \'5 days\' ORDER BY last_archived_check - LIMIT $1`, - [batchSize] + LIMIT $1 OFFSET $2`, + [batchSize, offset] ); return result.rows.map(row => row.repository); diff --git a/services/cronjobs/archived_repositories/src/main.ts b/services/cronjobs/archived_repositories/src/main.ts index c4433a5472..e6e0b46912 100644 --- a/services/cronjobs/archived_repositories/src/main.ts +++ b/services/cronjobs/archived_repositories/src/main.ts @@ -11,6 +11,7 @@ function sleep(ms: number): Promise { async function main(config: Config) { let totalProcessed = 0; let batchNumber = 1; + let offset = 0; const queueOptions = { connection: { url: config.RedisUrl }, @@ -39,7 +40,7 @@ async function main(config: Config) { while (true) { console.log(`Processing batch ${batchNumber}...`); - const repoURLs = await fetchRepositoryUrls(config.BatchSize, config); + const repoURLs = await fetchRepositoryUrls(config.BatchSize, offset, config); if (repoURLs.length === 0) { console.log(`No more repositories found. Total processed: ${totalProcessed} repositories.`); @@ -75,6 +76,7 @@ async function main(config: Config) { await sleep(config.BatchDelayMs); batchNumber++; + offset += repoURLs.length; } await closeConnection(); diff --git a/services/cronjobs/archived_repositories/src/workers.ts b/services/cronjobs/archived_repositories/src/workers.ts index c651d9c686..739eb06ed9 100644 --- a/services/cronjobs/archived_repositories/src/workers.ts +++ b/services/cronjobs/archived_repositories/src/workers.ts @@ -1,7 +1,7 @@ import { Worker, Job, WorkerOptions } from 'bullmq'; import { getConfig } from './config.js'; -import { isGitHubRepoArchived } from './clients/github'; -import { isGitLabRepoArchived } from './clients/gitlab'; +import { getGithubRepoStatus } from './clients/github'; +import { getGitlabRepoStatus } from './clients/gitlab'; import { GITHUB_QUEUE_NAME, GITLAB_QUEUE_NAME, Platform } from './types'; import { updateRepositoryStatus } from "./database"; @@ -16,20 +16,24 @@ async function handleJob(job: Job) { switch (job.data.platform) { case Platform.GITHUB: console.log(`Processing GitHub repo: ${job.data.url}`); - archived = excluded = await isGitHubRepoArchived(job.data.url, config); + const githubStatus = await getGithubRepoStatus(job.data.url, config); + archived = githubStatus.archived; + excluded = githubStatus.excluded; // .github repositories should always be excluded from calculations, regardless of whether they are archived. const parsed = new URL(job.data.url); const parts = parsed.pathname.split('/').filter(Boolean); if (parts.length >= 2 && parts[1] === '.github') { - console.log(`Skipping .github repository: ${job.data.url}`); + console.log(`Forcefully marking .github repository as excluded: ${job.data.url}`); excluded = true; } break; case Platform.GITLAB: console.log(`Processing GitLab repo: ${job.data.url}`); - archived = excluded = await isGitLabRepoArchived(job.data.url, config); + const gitlabStatus = await getGitlabRepoStatus(job.data.url, config); + archived = gitlabStatus.archived; + excluded = gitlabStatus.excluded; break; default: throw new Error(`Unsupported platform: ${job.data.platform}`); From 3f852a513fbc8efc432c802e53fcc95389e1ac6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Fri, 29 Aug 2025 23:59:15 +0100 Subject: [PATCH 15/22] chore: mark 404 and 403 as excluded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../src/clients/github.ts | 34 ++++++++++++++----- .../src/clients/gitlab.ts | 29 +++++++++++++--- .../archived_repositories/src/types.ts | 5 +++ 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/services/cronjobs/archived_repositories/src/clients/github.ts b/services/cronjobs/archived_repositories/src/clients/github.ts index 626d1f545a..e930afd596 100644 --- a/services/cronjobs/archived_repositories/src/clients/github.ts +++ b/services/cronjobs/archived_repositories/src/clients/github.ts @@ -1,7 +1,8 @@ import { ofetch } from 'ofetch'; import { Config } from "../config"; +import { RepositoryStatus } from "../types"; -export async function isGitHubRepoArchived(url: string, config: Config): Promise { +export async function getGithubRepoStatus(url: string, config: Config): Promise { const parsed = new URL(url); const parts = parsed.pathname.split('/').filter(Boolean); @@ -11,12 +12,29 @@ export async function isGitHubRepoArchived(url: string, config: Config): Promise const [owner, repo] = parts; - const data = await ofetch(`https://api.github.com/repos/${owner}/${repo}`, { - headers: { - Authorization: `Bearer ${config.GithubToken}`, - Accept: 'application/vnd.github+json', - }, - }); + try { + const data = await ofetch(`https://api.github.com/repos/${owner}/${repo}`, { + headers: { + Authorization: `Bearer ${config.GithubToken}`, + Accept: 'application/vnd.github+json', + }, + }); - return data.archived; + return { + archived: data.archived, + excluded: false + }; + } catch (error: any) { + // Handle 404 (not found) and 403 (forbidden) as excluded repositories + if (error?.status === 404 || error?.status === 403) { + console.log(`GitHub repo not accessible (${error.status}): ${url} - marking as excluded`); + return { + archived: false, + excluded: true + }; + } + + // Re-throw other errors to maintain existing error handling + throw error; + } } diff --git a/services/cronjobs/archived_repositories/src/clients/gitlab.ts b/services/cronjobs/archived_repositories/src/clients/gitlab.ts index d71fea8aef..7ef30fc4b5 100644 --- a/services/cronjobs/archived_repositories/src/clients/gitlab.ts +++ b/services/cronjobs/archived_repositories/src/clients/gitlab.ts @@ -1,7 +1,8 @@ import { ofetch } from 'ofetch'; import { Config } from "../config"; +import { RepositoryStatus } from "../types"; -export async function isGitLabRepoArchived(url: string, config: Config): Promise { +export async function getGitlabRepoStatus(url: string, config: Config): Promise { const parsed = new URL(url); const projectPath = parsed.pathname.split('/').filter(Boolean).join('/'); @@ -10,9 +11,27 @@ export async function isGitLabRepoArchived(url: string, config: Config): Promise } const encodedProjectPath = encodeURIComponent(projectPath); - const data = await ofetch(`https://gitlab.com/api/v4/projects/${encodedProjectPath}`, { - headers: { 'PRIVATE-TOKEN': config.GitlabToken }, - }); - return data.archived; + try { + const data = await ofetch(`https://gitlab.com/api/v4/projects/${encodedProjectPath}`, { + headers: { 'PRIVATE-TOKEN': config.GitlabToken }, + }); + + return { + archived: data.archived, + excluded: false + }; + } catch (error: any) { + // Handle 404 (not found) and 403 (forbidden) as excluded repositories + if (error?.status === 404 || error?.status === 403) { + console.log(`GitLab repo not accessible (${error.status}): ${url} - marking as excluded`); + return { + archived: false, + excluded: true + }; + } + + // Re-throw other errors to maintain existing error handling + throw error; + } } diff --git a/services/cronjobs/archived_repositories/src/types.ts b/services/cronjobs/archived_repositories/src/types.ts index 1f941990c3..d5f4e94d4f 100644 --- a/services/cronjobs/archived_repositories/src/types.ts +++ b/services/cronjobs/archived_repositories/src/types.ts @@ -6,6 +6,11 @@ export enum Platform { GITLAB = 'gitlab', } +export interface RepositoryStatus { + archived: boolean; + excluded: boolean; +} + export interface JobData { name: string; data: { From 21a292daf0e62b64d06b470f468b9a514fe0d9d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Sat, 30 Aug 2025 00:07:16 +0100 Subject: [PATCH 16/22] fix: fix SQL error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- services/cronjobs/archived_repositories/src/database.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/cronjobs/archived_repositories/src/database.ts b/services/cronjobs/archived_repositories/src/database.ts index 21a984db0a..8c8ec33685 100644 --- a/services/cronjobs/archived_repositories/src/database.ts +++ b/services/cronjobs/archived_repositories/src/database.ts @@ -43,7 +43,7 @@ export async function fetchRepositoryUrls(batchSize: number, offset: number, con try { const result = await client.query( `SELECT repository FROM "segmentRepositories" - WHERE (last_archived_check IS NULL OR last_archived_check < NOW() - INTERVAL \'5 days\' + WHERE last_archived_check IS NULL OR last_archived_check < NOW() - INTERVAL \'5 days\' ORDER BY last_archived_check LIMIT $1 OFFSET $2`, [batchSize, offset] From 5aa4c04c172d2529ae48635ae1b4857d542ed89a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Sat, 30 Aug 2025 23:12:31 +0100 Subject: [PATCH 17/22] chore: recheck repositories every 3 days, not 5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- services/cronjobs/archived_repositories/src/database.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/cronjobs/archived_repositories/src/database.ts b/services/cronjobs/archived_repositories/src/database.ts index 8c8ec33685..3929caf0d7 100644 --- a/services/cronjobs/archived_repositories/src/database.ts +++ b/services/cronjobs/archived_repositories/src/database.ts @@ -43,7 +43,7 @@ export async function fetchRepositoryUrls(batchSize: number, offset: number, con try { const result = await client.query( `SELECT repository FROM "segmentRepositories" - WHERE last_archived_check IS NULL OR last_archived_check < NOW() - INTERVAL \'5 days\' + WHERE last_archived_check IS NULL OR last_archived_check < NOW() - INTERVAL \'3 days\' ORDER BY last_archived_check LIMIT $1 OFFSET $2`, [batchSize, offset] From b8fa46d33d363527f7132ad9461f797335913e90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Mon, 1 Sep 2025 16:31:41 +0100 Subject: [PATCH 18/22] fix: fix segmentRepositories.datasource formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../libs/tinybird/datasources/segmentRepositories.datasource | 1 - 1 file changed, 1 deletion(-) diff --git a/services/libs/tinybird/datasources/segmentRepositories.datasource b/services/libs/tinybird/datasources/segmentRepositories.datasource index 36b0db4584..08eaa11789 100644 --- a/services/libs/tinybird/datasources/segmentRepositories.datasource +++ b/services/libs/tinybird/datasources/segmentRepositories.datasource @@ -21,7 +21,6 @@ SCHEMA > `createdAt` DateTime64(3) `json:$.record.createdAt`, `updatedAt` DateTime64(3) `json:$.record.updated_at` - ENGINE ReplacingMergeTree ENGINE_PARTITION_KEY toYear(createdAt) ENGINE_SORTING_KEY repository From 4008d0eb1cb18eed1d6a88527303e1efa93b915f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Mon, 1 Sep 2025 16:47:19 +0100 Subject: [PATCH 19/22] fix: fix insights_projects_populated_copy.pipe formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../libs/tinybird/pipes/insights_projects_populated_copy.pipe | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe index d65b464d9b..5fd81a5c21 100644 --- a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe +++ b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe @@ -118,8 +118,7 @@ SQL > groupArrayIf(repository, archived = true) AS "archivedRepositories", groupArrayIf(repository, excluded = true) AS "excludedRepositories" FROM segmentRepositories FINAL - WHERE - archived = true OR excluded = true + WHERE archived = true OR excluded = true GROUP BY segmentId, insightsProjectId NODE insights_projects_populated_copy_results From 5a6bd810be5bd413c687cfa891f03e4a31feef5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Mon, 1 Sep 2025 16:57:59 +0100 Subject: [PATCH 20/22] fix: fix insightsProjects_filtered.pipe SQL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- services/libs/tinybird/pipes/insightsProjects_filtered.pipe | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/libs/tinybird/pipes/insightsProjects_filtered.pipe b/services/libs/tinybird/pipes/insightsProjects_filtered.pipe index f43cac97f1..75ff11afcc 100644 --- a/services/libs/tinybird/pipes/insightsProjects_filtered.pipe +++ b/services/libs/tinybird/pipes/insightsProjects_filtered.pipe @@ -75,6 +75,8 @@ SQL > insights_projects_populated_ds.description, insights_projects_populated_ds.logoUrl as logo, insights_projects_populated_ds.repositories, + insights_projects_populated_ds.archivedRepositories, + insights_projects_populated_ds.excludedRepositories, insights_projects_populated_ds.isLF, insights_projects_populated_ds.widgets, insights_projects_populated_ds.keywords, From 1d7cd18c11962ab4d187a21fdf2ad9f1eb57fc10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:35:10 +0100 Subject: [PATCH 21/22] fix: fix insights_projects_populated_copy.pipe SQL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../libs/tinybird/pipes/insights_projects_populated_copy.pipe | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe index 5fd81a5c21..702e84a677 100644 --- a/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe +++ b/services/libs/tinybird/pipes/insights_projects_populated_copy.pipe @@ -114,7 +114,7 @@ DESCRIPTION > SQL > SELECT segmentId, - insightsProjectId, + toString(insightsProjectId) AS insightsProjectId, groupArrayIf(repository, archived = true) AS "archivedRepositories", groupArrayIf(repository, excluded = true) AS "excludedRepositories" FROM segmentRepositories FINAL From 0c2b86c5095a57d38c7f7dc55ac24683e5ac05fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Santos?= <4837+borfast@users.noreply.github.com> Date: Mon, 1 Sep 2025 18:36:07 +0100 Subject: [PATCH 22/22] fix: fix search_collections_projects_repos.pipe SQL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Raúl Santos <4837+borfast@users.noreply.github.com> --- .../tinybird/pipes/search_collections_projects_repos.pipe | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/services/libs/tinybird/pipes/search_collections_projects_repos.pipe b/services/libs/tinybird/pipes/search_collections_projects_repos.pipe index c3498b8f22..c224aded6d 100644 --- a/services/libs/tinybird/pipes/search_collections_projects_repos.pipe +++ b/services/libs/tinybird/pipes/search_collections_projects_repos.pipe @@ -20,7 +20,9 @@ SQL > collections_filtered.slug, null as logo, null as projectSlug, - collections_filtered.name + collections_filtered.name, + CAST(NULL AS Nullable(UInt8)) as archived, + CAST(NULL AS Nullable(UInt8)) as excluded from collections_filtered order by collections_filtered.projectCount desc limit {{ Integer(limit, 10, description="Limit number of records for each type", required=False) }} @@ -30,7 +32,9 @@ SQL > insightsProjects_filtered.slug, insightsProjects_filtered.logo, insightsProjects_filtered.slug as "projectSlug", - insightsProjects_filtered.name + insightsProjects_filtered.name, + CAST(NULL AS Nullable(UInt8)) as archived, + CAST(NULL AS Nullable(UInt8)) as excluded from insightsProjects_filtered where not (