Glittr stats

In this report you can find some general statistics about Glittr.org. The plots and statistics created are amongst others used in the manuscript. Since Glittr.org is an ongoing project these statistics are updated weekly.

Set up the environment

This is required if you run this notebook locally. Loading required packages.

To run locally, create a file named .env and add your GitHub PAT (variable named PAT ) and google api key (named GOOGLE_API_KEY) in there, e.g.:

# this is an example, store it as .env:
export PAT="ghp_aRSRESCTZII20Lklser3H"
export GOOGLE_API_KEY="AjKSLE5SklxuRsxwPP8s0"

Now use your UNIX terminal to source this file to get the keys as objects:

source .env

In R, get environment variables as objects:

pat <- Sys.getenv("PAT")
google_api_key <- Sys.getenv("GOOGLE_API_KEY")
matomo_api_key <- Sys.getenv("MATOMO_API_KEY")

Setting colors. These correspond to the category colours on glittr.org.

glittr_cols <- c(
  "Scripting and languages" =             "#3a86ff",
  "Computational methods and pipelines" = "#fb5607",
  "Omics analysis" =                      "#ff006e",
  "Reproducibility and data management" = "#ffbe0b",
  "Statistics and machine learning" =     "#8338ec",
  "Others" =                              "#000000")

Parse repository data

Using the glittr.org REST API to get repository metadata, among which the stargazers, recency, category, license and tags.

Code
# while loop to loop over all pages
page_list <- list()
page <- 1
has_more_pages <- TRUE

while (has_more_pages) {
  # Create and send the request with pagination
  response <- request("https://glittr.org/api/repositories") |>
    req_url_query(`page[size]` = 100, `page[number]` = page) |>
    req_perform() |>
    resp_body_json()
  
  # Append the data to the list
  page_list <- append(page_list, response$data)
  
  # Check if there are more pages (this logic depends on the API's response structure)
  has_more_pages <- length(response$data) > 0  # Adjust this condition based on your API's pagination logic
  page <- page + 1
}

# extract relevant items as dataframe
repo_info_list <- lapply(page_list, function(x) data.frame(
  repo = x$name,
  author_name = x$author$name,
  stargazers = x$stargazers,
  recency = x$days_since_last_push,
  url = x$url,
  license = ifelse(is.null(x$license), "none", x$license),
  main_tag = x$tags[[1]]$name,
  main_category = x$tags[[1]]$category,
  website = x$website,
  author_profile = x$author$profile,
  author_website = x$author$website
))

repo_info <- do.call(rbind, repo_info_list)

# create a column with provider (either github or gitlab)
repo_info$provider <- ifelse(grepl("github", repo_info$url), "github", "gitlab")

# create a factor for categories for sorting
repo_info$main_category <- factor(repo_info$main_category,
                                  levels = names(glittr_cols))

# category table to keep order the same in the plots
cat_table <- table(category = repo_info$main_category)
cat_table <- sort(cat_table)

Number of repositories: 672

Get contributors info

Using the GitHub REST API to get the number of contributors for each repository on glittr.org. This takes a few minutes, so if the contributors haven’t changed, it will use a cached version.

Code
# take long time to run, so try to use cache results if no repos have been 
# added in the meantime

# check if data/n_contributors.rds exists
if(file.exists("data/n_contributors.rds")) {
  n_contributors <- readRDS("data/n_contributors.rds")
} else {
  n_contributors <- NULL
}

# get contributors info only from github repos
repo_info_gh <- repo_info[repo_info$provider == "github", ]

# get contributor info from github api if update is needed
if(!identical(sort(repo_info_gh$repo), sort(names(n_contributors)))) {
  dir.create("data", showWarnings = FALSE)
  n_contributors <- sapply(repo_info_gh$repo, function(x) {
    
    # get repo contributors
    resp <- request("https://api.github.com/repos/") |>
      req_url_path_append(x) |>
      req_url_path_append("contributors") |>
      req_url_query(per_page = 1) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28",
      ) |>
      req_perform() 
    
    link_url <- resp_link_url(resp, "last")
    if(is.null(link_url)) {
      return(1)
    } else {
      npages <- strsplit(link_url, "&page=")[[1]][2] |> as.numeric()
      return(npages)
    }
  })
  
  # overwrite rds file
  saveRDS(n_contributors, "data/n_contributors.rds")
}

repo_info_gh$contributors <- n_contributors[repo_info_gh$repo]

Get country information

Here we get country information for all authors and organizations. It uses the free text specified at ‘location’. Since this can be anything, we use the google REST API to translate that into country.

Code
# check whether author info exists for caching
if(file.exists("data/author_info.rds")) {
  author_info <- readRDS("data/author_info.rds")
  author_info_authors <- unique(author_info$author) |> sort()
} else {
  author_info_authors <- NULL
}

gh_authors <- repo_info$author_name[repo_info$provider == "github"] |>
  unique() |>
  sort()

# if the author info is out of date, update it
if(!identical(gh_authors, author_info_authors)) {
  author_info_list <- list()
  for(author in gh_authors) {
    
    parsed <- request("https://api.github.com/users/") |>
      req_url_path_append(author) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28",
      ) |>
      req_perform() |>
      resp_body_json()
    
    author_info_list[[author]] <- data.frame(
      author = parsed$login,
      type = parsed$type,
      name = ifelse(is.null(parsed$name), NA, parsed$name),
      location = ifelse(is.null(parsed$location), NA, parsed$location)
    )
  }
  
  author_info <- do.call(rbind, author_info_list)
  
  author_info_loc <- author_info[!is.na(author_info$location), ]
  
  author_loc <- author_info_loc$location
  names(author_loc) <- author_info_loc$author
  
  ggmap::register_google(key = google_api_key)
  loc_info <- ggmap::geocode(author_loc,
                             output = 'all')
  
  get_country <- function(loc_results) {
    if("results" %in% names(loc_results)) {
      for(results in loc_results$results) {
        address_info <- results$address_components |> 
          lapply(unlist) |> 
          do.call(rbind, args = _) |>
          as.data.frame()
        country <- address_info$long_name[address_info$types1 == "country"]
        if (length(country) == 0) next
      }
      if (length(country) == 0) return(NA)
      return(country)
    } else {
      return(NA)
    }
  }
  
  countries <- sapply(loc_info, get_country)
  names(countries) <- names(author_loc)
  
  author_info$country <- countries[author_info$author]
  
  saveRDS(author_info, "data/author_info.rds")
}

repo_info <- merge(repo_info, author_info, by.x = "author_name",
                   by.y = "author")
repo_info$country[is.na(repo_info$country)] <- "undefined"
  • Number of authors: 319
  • Number of countries: 26

Parse tag data

Here, we create tag_df that contains information for each tag by using the glittr.org API.

parsed <- request("https://glittr.org/api/tags") |>
  req_perform() |>
  resp_body_json()

tag_dfs <- list()
for(i in seq_along(parsed)) {
  category <- parsed[[i]]$category
  name <- sapply(parsed[[i]]$tags, function(x) x$name)
  repositories <- sapply(parsed[[i]]$tags, function(x) x$repositories)
  tag_dfs[[category]] <- data.frame(name, category, repositories)
}

tag_df <- do.call(rbind, tag_dfs) |> arrange(repositories)

Number of tags/topics: 61

Number of repositories by category

This is figure 2A in the manuscript.

cat_count_plot <- table(category = repo_info$main_category) |>
  as.data.frame() |>
  ggplot(aes(x = reorder(category, Freq), y = Freq, fill = category)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  theme_classic() +
  ggtitle("Categories") +
  theme(legend.position = "none",
        axis.title.y = element_blank()) +
  ylab("Number of repositories")

print(cat_count_plot)
Figure 1: Number of repositories per category

And a table with the actual numbers

category_count <- table(category = repo_info$main_category) |> as.data.frame()
knitr::kable(category_count)
Table 1: Number of repositories per category
category Freq
Scripting and languages 311
Computational methods and pipelines 48
Omics analysis 154
Reproducibility and data management 45
Statistics and machine learning 83
Others 29

Number of contributors per repository separated by category

This is figure 2B in the manuscript.

repo_info_gh$main_category <- factor(repo_info_gh$main_category,
                                     levels = names(cat_table))

contributors_plot <- repo_info_gh |>
  ggplot(aes(x = main_category, y = contributors, fill = main_category)) +
  geom_violin(scale = "width") +
  geom_boxplot(width = 0.1, col = "darkgrey") +
  coord_flip() +
  ggtitle("Contributors") +
  ylab("Number of contributors") +
  scale_y_sqrt() +
  scale_fill_manual(values = glittr_cols) +
  theme_bw() +
  theme(legend.position = "none",
        axis.title.y = element_blank(),
        plot.margin = margin(t = 5, r = 10, b = 5, l = 10))

print(contributors_plot)
Figure 2: Number of contributors per repository separated by category

And some statistics of contributors.

nna_contr <- repo_info_gh$contributors
param1 <- sum(nna_contr > 10)/length(nna_contr)
param2 <- sum(nna_contr > 1)/length(nna_contr)
param3 <- sum(nna_contr <= 5)/length(nna_contr)
  • More than 10 contributors: 25.2%
  • More than 1 contributor: 80.1%
  • Between 1 and 5 contributors: 60%

Number of repositories per tag

This is figure 2C in the manuscript.

tag_freq_plot <- tag_df |>
  filter(repositories > 10) |>
  ggplot(aes(x = reorder(name, repositories),
             y = repositories, fill = category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  ggtitle("Tags with > 10 repositories") +
  ylab("Number of repositories") +
  annotate(geom = "text", x = 2, y = 150,
           label = paste("Total number of tags: ",
                         nrow(tag_df)),
           color="black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(tag_freq_plot)
Figure 3: Number of repostories per tag, colored by category.

And a table with the actual numbers.

tag_df |>
  filter(repositories > 10) |>
  arrange(desc(repositories)) |>
  knitr::kable(row.names = FALSE)
Table 2: Number of repositories per tag
name category repositories
R Scripting and languages 267
Python Scripting and languages 104
Transcriptomics Omics analysis 88
RNA-seq Omics analysis 83
Next generation sequencing Omics analysis 64
Statistics Statistics and machine learning 61
Data science Statistics and machine learning 55
Genomics Omics analysis 53
Machine learning Statistics and machine learning 51
Single-cell sequencing Omics analysis 50
Data management Reproducibility and data management 42
Unix/Linux Scripting and languages 40
Reproducibility Reproducibility and data management 39
Data visualization Scripting and languages 36
FAIR data Reproducibility and data management 35
General Others 34
Variant analysis Omics analysis 30
Version control Scripting and languages 27
Workflows Computational methods and pipelines 21
Shiny Scripting and languages 20
Containerization Computational methods and pipelines 20
Metagenomics Omics analysis 18
Docker Computational methods and pipelines 16
ChIP-seq Omics analysis 16
Julia Scripting and languages 13
Nextflow Computational methods and pipelines 12
ATAC-seq Omics analysis 12
Epigenetics Omics analysis 12
Quarto Scripting and languages 11
Image analysis Computational methods and pipelines 11

Number of repositories by author

This is figure 2D in the manuscript.

author_freq <- table(author_name = repo_info$author_name, 
                     main_category = repo_info$main_category) |>
  as.data.frame()

author_freq$main_category <- factor(author_freq$main_category,
                                     levels = names(cat_table))

repos_per_author <- table(repo_info$author_name)

lf_authors <- names(repos_per_author)[repos_per_author < 5]

author_freq_plot <- author_freq |>
  filter(!author_name %in% lf_authors) |>
  arrange(Freq) |>
  ggplot(aes(x = reorder(author_name, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Author or organization") +
  ylab("Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  annotate(geom = "text", x = 2, y = 30,
           label = paste("Authors with < 5 repos: ",
                         length(lf_authors)),
           color="black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(author_freq_plot)
Figure 4: Number of repositories per author colored by category

And a table with the actual numbers.

table(repo_info$author_name) |>
  as.data.frame() |>
  filter(Freq >= 5) |>
  arrange(desc(Freq)) |>
  knitr::kable()
Table 3: Number of repositories per author
Var1 Freq
carpentries-incubator 46
sib-swiss 28
NBISweden 22
posit-conf-2023 20
ucdavis-bioinformatics-training 18
hbctraining 17
posit-conf-2024 17
datacarpentry 16
bioinformaticsdotca 14
bioinformatics-core-shared-training 13
bioinformatics-ca 10
GTPB 10
fhdsl 9
rstudio-conf-2022 9
learnbyexample 8
RockefellerUniversity 7
semacu 7
biocorecrg 6
JuliaAcademy 6
swcarpentry 6
cambiotraining 5
carpentries-lab 5
hadley 5
jhudsl 5

Number of repositories per license

This is figure 2E in the manuscript.

lic_freq_data <- table(license = repo_info$license,
                       main_category = repo_info$main_category) |>
  as.data.frame()

lic_freq_data$main_category <- factor(lic_freq_data$main_category,
                                     levels = names(cat_table))

lic_freq_plot <- lic_freq_data |>
  ggplot(aes(x = reorder(license, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  theme_classic() +
  ggtitle("License type") +
  ylab("Number of repositories") +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(lic_freq_plot)
Figure 5: Number of repositories per license

And a table with the actual numbers.

repo_info$license |>
  table() |>
  as.data.frame() |>
  mutate(perc = round(Freq/nrow(repo_info)*100, 1)) |>
  arrange(desc(Freq)) |>
  knitr::kable()
Table 4: Number of repositories per license
Var1 Freq perc
other 226 33.7
none 192 28.7
mit 74 11.0
cc-by-sa-4.0 48 7.2
cc-by-4.0 44 6.6
gpl-3.0 30 4.5
cc0-1.0 26 3.9
apache-2.0 12 1.8
bsd-3-clause 12 1.8
agpl-3.0 2 0.3
artistic-2.0 2 0.3
unlicense 1 0.1
wtfpl 1 0.1

Number of repositories per country

This is figure 2F in the mansucript.

country_freq <- table(country = repo_info$country, 
                      main_category = repo_info$main_category) |>
  as.data.frame()

country_freq$main_category <- factor(country_freq$main_category,
                                     levels = names(cat_table))

country_freq_plot <- country_freq |>
  filter(country != "undefined") |>
  ggplot(aes(x = reorder(country, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Country") +
  ylab("Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  annotate(geom = "text", x = 2, y = 70,
           label = paste("Repos with undefined country: ",
                         sum(repo_info$country == "undefined")),
           color="black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(country_freq_plot)
Figure 6: Number of repositories per country colored by category

And a table with the actual numbers.

repo_info$country |> 
  table() |> 
  as.data.frame() |> 
  arrange(desc(Freq)) |> 
  knitr::kable()
Table 5: Number of repositories per country
Var1 Freq
undefined 269
United States 181
Canada 32
Switzerland 31
Sweden 23
United Kingdom 23
Australia 15
France 14
Germany 14
Netherlands 12
Portugal 11
Belgium 10
Spain 8
Denmark 4
India 4
Norway 4
Ireland 3
Italy 3
Bulgaria 2
Argentina 1
China 1
Finland 1
Luxembourg 1
Mexico 1
Poland 1
Ukraine 1

Summary plot

Full figure 2 of the manuscript.

p <- plot_grid(cat_count_plot, contributors_plot, 
          tag_freq_plot, author_freq_plot,  
          lic_freq_plot, country_freq_plot,
          ncol = 2, labels = LETTERS[1:6],
          rel_heights = c(2,3,3))

ggsave("grid_plot_fig2.pdf", width = 10, height = 10)
ggsave("grid_plot_fig2.eps", width = 10, height = 10)