Glittr stats

In this report you can find some general statistics about Glittr.org. The plots and statistics created here are used, among other things, in the manuscript. Since Glittr.org is an ongoing project, these statistics are updated weekly.

Set up the environment

This set-up is only required if you run this notebook locally; it loads the required packages and credentials.

To run locally, create a file named .env and add your GitHub PAT (variable named PAT ) and google api key (named GOOGLE_API_KEY) in there, e.g.:

# this is an example, store it as .env:
export PAT="ghp_aRSRESCTZII20Lklser3H"
export GOOGLE_API_KEY="AjKSLE5SklxuRsxwPP8s0"

Now use your UNIX terminal to source this file to get the keys as objects:

source .env

In R, get environment variables as objects:

# Read the API credentials from the environment (populated by sourcing .env).
pat             <- Sys.getenv("PAT")
google_api_key  <- Sys.getenv("GOOGLE_API_KEY")
matomo_api_key  <- Sys.getenv("MATOMO_API_KEY")

Setting colors. These correspond to the category colours on glittr.org.

# Category colours; these match the palette used on glittr.org itself.
glittr_cols <- c(
  "Scripting and languages"             = "#3a86ff",
  "Computational methods and pipelines" = "#fb5607",
  "Omics analysis"                      = "#ff006e",
  "Reproducibility and data management" = "#ffbe0b",
  "Statistics and machine learning"     = "#8338ec",
  "Others"                              = "#000000"
)

Parse repository data

Using the glittr.org REST API to get repository metadata, among which the stargazers, recency, category, license and tags.

Code
# Fetch repository metadata from the glittr.org REST API.
response <- request("https://glittr.org/api/repositories") |>
  req_perform() |>
  resp_body_json()

# Scalar helper: replace a NULL JSON field with NA so data.frame() neither
# errors nor drops the column.
null_to_na <- function(value) {
  if (is.null(value)) NA else value
}

# Extract the relevant fields of every repository as a one-row data frame.
# Repositories without an author are skipped: returning NULL keeps them out
# of the later do.call(rbind, ...). (The previous warning()-only branch
# returned the warning message itself as the list element, which leaked a
# bare character string into the rbind input.)
repo_info_list <- lapply(response$data, function(x) {
  if (is.null(x$author$name)) {
    warning(sprintf("%s has no author specified", x$name))
    return(NULL)
  }
  data.frame(
    repo = x$name,
    author_name = x$author$name,
    stargazers = x$stargazers,
    recency = x$days_since_last_push,
    url = x$url,
    # scalar field, so plain if/else rather than vectorized ifelse()
    license = if (is.null(x$license)) "none" else x$license,
    main_tag = x$tags[[1]]$name,
    main_category = x$tags[[1]]$category,
    website = null_to_na(x$website),
    author_profile = null_to_na(x$author$profile),
    author_website = null_to_na(x$author$website)
  )
})
Warning in FUN(X[[i]], ...):
ucdavis-bioinformatics-training/2020-Bioinformatics_Prerequisites_Workshop has
no author specified
Code
# Combine the per-repository rows into one data frame (NULL entries are
# ignored by rbind).
repo_info <- do.call(rbind, repo_info_list)

# Derive the hosting provider from the repository URL.
repo_info$provider <- ifelse(grepl("github", repo_info$url), "github", "gitlab")

# Categories as a factor so every plot uses the same fixed ordering.
repo_info$main_category <- factor(repo_info$main_category,
                                  levels = names(glittr_cols))

# Ascending frequency table of categories, reused to order later plots.
cat_table <- sort(table(category = repo_info$main_category))

Number of repositories: 797

SIB courses not yet available on Glittr

Extracting course material URLs from the Bioschemas/JSON-LD markup on the SIB training materials page, then comparing those URLs with repository URLs that are already on Glittr.

Code
sib_request <- request("https://www.sib.swiss/training/training-materials")

# Download the SIB training-materials page; some servers misbehave over
# HTTP/2, so fall back to HTTP/1.1 when the first attempt errors.
fetch_sib_page <- function(http_version) {
  sib_request |>
    req_options(http_version = http_version) |>
    req_perform() |>
    resp_body_string()
}

sib_materials_page <- tryCatch(
  fetch_sib_page(2L),
  error = function(e) fetch_sib_page(1L)
)

# Pull all embedded JSON-LD (Bioschemas) blocks out of the HTML document.
sib_doc <- read_html(sib_materials_page)
ld_json_blocks <- sib_doc |>
  xml_find_all("//script[@type='application/ld+json']") |>
  xml_text(trim = TRUE)

# TRUE when a parsed JSON-LD node is a Bioschemas "Course" object, i.e. a
# list whose @type field contains "Course".
is_course_node <- function(node) {
  is.list(node) &&
    !is.null(node$`@type`) &&
    "Course" %in% as.character(node$`@type`)
}

# Recursively walk a parsed JSON-LD structure and return a flat list of all
# Course nodes found at any depth.
collect_course_nodes <- function(node) {
  found <- if (is_course_node(node)) list(node) else list()
  if (is.list(node)) {
    for (child in node) {
      found <- c(found, collect_course_nodes(child))
    }
  }
  found
}

# Parse every JSON-LD block (silently skipping malformed ones) and gather
# all Course nodes they contain.
course_nodes <- list()
for (block in ld_json_blocks) {
  parsed <- tryCatch(
    jsonlite::fromJSON(block, simplifyVector = FALSE),
    error = function(e) NULL
  )
  if (is.null(parsed)) next
  course_nodes <- c(course_nodes, collect_course_nodes(parsed))
}

# Extract, for one JSON-LD Course node, a data frame with one row per course
# instance: course name, instance URL and featured material URL (NA where a
# field is missing). Returns NULL when the course has no instances.
extract_course_materials <- function(course) {
  instances <- course$hasCourseInstance
  if (is.null(instances)) return(NULL)

  # A single instance is itself an object (it carries an @type key); wrap it
  # so single- and multi-instance courses are handled uniformly.
  if (!is.list(instances) || (!is.null(instances$`@type`))) {
    instances <- list(instances)
  }

  # Scalar NULL-to-NA helper. ifelse() is vectorized and strips attributes,
  # so plain if/else is the right tool for these scalar fields.
  scalar_or_na <- function(value) {
    if (is.null(value)) NA else value
  }

  rows <- lapply(instances, function(instance) {
    data.frame(
      course = scalar_or_na(course$name),
      course_instance_url = scalar_or_na(instance$url),
      material_url = scalar_or_na(instance$workFeatured$url),
      stringsAsFactors = FALSE
    )
  })

  do.call(rbind, rows)
}

# Build one data frame of course materials, decode HTML-escaped ampersands,
# and keep a single row per (course, material URL) pair.
course_materials <- lapply(course_nodes, extract_course_materials)
course_materials <- Filter(Negate(is.null), course_materials)
course_materials <- do.call(rbind, course_materials)

course_materials <- course_materials |>
  mutate(
    material_url = gsub("&amp;", "&", material_url, fixed = TRUE),
    course_instance_url = gsub("&amp;", "&", course_instance_url, fixed = TRUE)
  ) |>
  distinct(course, material_url, .keep_all = TRUE)

# Normalize a single repository-like URL to a comparable key of the form
# "<host>/<owner>/<repo>": lower case, no scheme, no "www.", no query string,
# fragment, trailing slash or ".git" suffix. GitHub/GitLab Pages URLs
# ("owner.github.io/repo") are mapped back to the underlying repository.
# Returns NA_character_ for missing or empty input.
normalize_repo_url <- function(url) {
  # Robustness: also catch NULL and zero-length input, which would otherwise
  # make the if() condition itself fail (is.na(NULL) is logical(0)).
  if (is.null(url) || length(url) != 1 || is.na(url) || !nzchar(url)) {
    return(NA_character_)
  }

  x <- trimws(tolower(url))
  x <- gsub("&amp;", "&", x, fixed = TRUE)
  x <- sub("^https?://", "", x)   # drop scheme
  x <- sub("^www\\.", "", x)      # drop leading www.
  x <- sub("\\?.*$", "", x)       # drop query string
  x <- sub("#.*$", "", x)         # drop fragment
  x <- sub("/$", "", x)           # drop trailing slash
  x <- sub("\\.git$", "", x)      # drop clone suffix

  # GitHub Pages: owner.github.io/repo -> github.com/owner/repo
  if (grepl("^[^/]+\\.github\\.io/", x)) {
    parts <- strsplit(x, "/")[[1]]
    org <- sub("\\.github\\.io$", "", parts[1])
    repo <- if (length(parts) >= 2) parts[2] else NA_character_
    if (!is.na(repo) && nzchar(repo)) {
      x <- paste("github.com", org, repo, sep = "/")
    }
  }

  # GitLab Pages: group.gitlab.io/repo -> gitlab.com/group/repo
  if (grepl("^[^/]+\\.gitlab\\.io/", x)) {
    parts <- strsplit(x, "/")[[1]]
    group <- sub("\\.gitlab\\.io$", "", parts[1])
    repo <- if (length(parts) >= 2) parts[2] else NA_character_
    if (!is.na(repo) && nzchar(repo)) {
      x <- paste("gitlab.com", group, repo, sep = "/")
    }
  }

  # Keep only host/owner/repo for GitHub URLs (drops /tree/..., /blob/...).
  if (grepl("^github\\.com/", x)) {
    parts <- strsplit(x, "/")[[1]]
    if (length(parts) >= 3) {
      x <- paste(parts[1:3], collapse = "/")
    }
  }

  # Same for GitLab (gitlab.com and the SIB instance); "/-/" marks the start
  # of GitLab UI paths and is stripped first.
  if (grepl("^gitlab(\\.com|\\.sib\\.swiss)/", x)) {
    x <- sub("/-/.+$", "", x)
    parts <- strsplit(x, "/")[[1]]
    if (length(parts) >= 3) {
      x <- paste(parts[1:3], collapse = "/")
    }
  }

  x
}

# Key every SIB material URL. vapply() (unlike sapply()) keeps the result a
# character vector even when the input is empty.
course_materials <- course_materials |>
  mutate(material_key = vapply(material_url, normalize_repo_url,
                               character(1)))

# Keep only materials that point at a GitHub/GitLab repository.
course_materials_repo_like <- course_materials |>
  filter(grepl("^(github\\.com|gitlab\\.com)/", material_key))

# Normalized keys of every repository already listed on glittr.org.
glittr_repo_keys <- unique(vapply(repo_info$url, normalize_repo_url,
                                  character(1)))

# SIB course materials whose repository is not yet on Glittr.
missing_courses_on_glittr <- course_materials_repo_like |>
  filter(!(material_key %in% glittr_repo_keys)) |>
  distinct(course, material_key) |>
  arrange(course)

Number of SIB courses with repository-like materials not yet available on Glittr: 0

missing_courses_on_glittr |>
  rename(course_title = course) |>
  knitr::kable(row.names = FALSE)
Table 1: SIB training courses with course-material repositories not yet available on Glittr
course_title material_key

TeSS materials not yet available on Glittr

Using the TeSS API materials endpoint, extracting only resource links from GitHub/GitLab (including GitHub/GitLab Pages), and comparing those repositories with Glittr.

Code
tess_request <- request("https://tess.elixir-europe.org/materials") |>
  req_headers(Accept = "application/vnd.api+json")

# Fetch one page of the TeSS materials listing as a parsed JSON list.
fetch_tess_page <- function(page_number = 1, page_size = 200) {
  paged_req <- tess_request |>
    req_url_query(page_number = page_number, page_size = page_size)

  perform_with <- function(version) {
    paged_req |>
      req_options(http_version = version) |>
      req_perform()
  }

  # Some servers misbehave over HTTP/2; retry once over HTTP/1.1.
  resp <- tryCatch(perform_with(2L), error = function(e) perform_with(1L))

  resp_body_json(resp, simplifyVector = FALSE)
}

# Walk the paginated listing until there is no "next" link or a page comes
# back empty, then flatten all pages into one list of materials.
tess_pages <- list()
page_number <- 1
keep_going <- TRUE
while (keep_going) {
  parsed <- fetch_tess_page(page_number = page_number, page_size = 200)
  tess_pages[[length(tess_pages) + 1]] <- parsed

  keep_going <- !is.null(parsed$links$`next`) && length(parsed$data) > 0
  page_number <- page_number + 1
}

tess_materials <- do.call(c, lapply(tess_pages, function(page) page$data))

# Collect all candidate resource URLs of one TeSS material: the main URL,
# the DOI and any external resources (which may be objects with a $url or
# bare strings). Returns a (possibly empty) character vector.
extract_tess_resource_urls <- function(material) {
  attrs <- material$attributes
  if (is.null(attrs)) return(character(0))

  urls <- c(
    attrs$url,
    attrs$doi
  )

  ext <- attrs[["external-resources"]]
  if (is.list(ext) && length(ext) > 0) {
    for (e in ext) {
      if (is.list(e) && !is.null(e$url)) {
        urls <- c(urls, e$url)
      } else if (is.character(e)) {
        urls <- c(urls, e)
      }
    }
  }

  # as.character() turns NULL (no URLs found at all) into character(0), so
  # the function has a consistent return type on every path.
  urls <- as.character(urls)
  urls <- urls[!is.na(urls) & nzchar(urls)]
  unique(urls)
}

# One row per (material, resource URL); materials without any resource URL
# are dropped. Scalar fields use if/else rather than the vectorized ifelse().
tess_resources <- lapply(tess_materials, function(material) {
  attrs <- material$attributes
  urls <- extract_tess_resource_urls(material)

  if (length(urls) == 0) return(NULL)

  data.frame(
    material_title = if (is.null(attrs$title)) NA else attrs$title,
    tess_material_url = if (is.null(material$links$self)) {
      NA
    } else {
      paste0("https://tess.elixir-europe.org", material$links$self)
    },
    resource_url = urls,
    stringsAsFactors = FALSE
  )
})

tess_resources <- Filter(Negate(is.null), tess_resources)
tess_resources <- do.call(rbind, tess_resources)

# Normalize every resource URL (vapply keeps the result type-stable even for
# empty input) and keep only GitHub/GitLab repositories.
tess_resources <- tess_resources |>
  mutate(resource_key = vapply(resource_url, normalize_repo_url,
                               character(1))) |>
  filter(grepl("^(github\\.com|gitlab\\.com)/", resource_key))

# Repositories referenced from TeSS that are not yet listed on Glittr.
tess_missing_on_glittr <- tess_resources |>
  filter(!(resource_key %in% glittr_repo_keys)) |>
  distinct(resource_key)

Number of TeSS materials with GitHub/GitLab resources not yet available on Glittr: 90

tess_missing_on_glittr |>
  rename(repository = resource_key) |>
  knitr::kable(row.names = FALSE)
Table 2: TeSS materials with GitHub/GitLab resources not yet available on Glittr
repository
github.com/elixir-cloud-aai/tutorials
github.com/vib-tcp/gentle-hands-on-python
github.com/vib-tcp/introduction-github
github.com/vib-tcp/containers-workshop
github.com/marineomics/admin_03_panels.html
github.com/bgacademy23/easel-annotation
github.com/nadegeguiglielmoni/embo-24-genome-sequencing
github.com/ccgproject/ccgp_assembly
github.com/kamilsjaron/k-mer-approaches-for-biodiversity-genomics
github.com/524d/comparems2
github.com/patterninstitute/osd758
github.com/vib-tcp/genai_4_training-trainingmaterial
github.com/vib-tcp/functional_analysis_training
github.com/gtpb/am21
github.com/pydna-group/pydna
github.com/hbctraining/intro-to-bulk-rnaseq
github.com/hbctraining/intro-to-scrnaseq
github.com/zemzemfiras1/nf-core-pre-hackathon_training2025
github.com/hbctraining/tools-for-reproducible-research
github.com/hbctraining/intro-to-peak-analysis
github.com/hbctraining/intro-to-variant-analysis
github.com/hbctraining/shell-for-bioinformatics
github.com/hbctraining/investigating-chromatin-biology-chipseq
github.com/hbctraining/intro-to-dge
github.com/gallantries/video-library
github.com/elixirestonia/2024-11-06-git
github.com/biodata-pt/computational-tools-resources
github.com/gladstone-institutes/bioinformatics-presentations
github.com/elixir-europe-training/elixir-trp-fair-material-by-design
github.com/hds-sandbox/rdm_biodata_course
github.com/carpentries-incubator/fair-for-busy-biologists
github.com/sophie-a-lee/intro-rstudio-mhclg
github.com/carpentries-incubator/fair-research-software
github.com/nutriome/workshop1
github.com/sbwiecko/intuitive_biostatistics
github.com/mlabonne/blog
github.com/biologia-computacional/posts
github.com/nesper94/teaching
github.com/ib-ulfri/instructor-notes
github.com/posit-dev/py-shiny-workshop
github.com/ucdavis-bioinformatics-training/2020-bioinformatics_prerequisites_workshop
github.com/vibbits/rdm-introductory-course
github.com/ics80-fa21/website
github.com/liascript/course
github.com/mjfrigaard/shinypak
github.com/tidyomics/tidy-ranges-tutorial
github.com/vibbits/nextflow-workshop
github.com/carpentries-incubator/reproducible-publications-quarto
github.com/carpentries-lab/good-enough-practices
github.com/carpentries-lab/deep-learning-intro
github.com/davidruvolo51/shinytutorials
github.com/elixir-europe-training/elixir-trp-tess
github.com/vib-tcp/nextflow-workshop
gitlab.com/chrjan/seq-seq-pan
github.com/carpentries/lesson-development-training
github.com/fairplus/fair_wizard
github.com/lcsb-biocore/cobrexa.jl
github.com/saezlab/cellnoptr
github.com/bridgedb/bridgedb-matlab
github.com/pathvisio/tutorials
github.com/dalalghamdi/ngbo
github.com/laurendupuis/scholia_tutorial
github.com/miappe/training
github.com/karrlab/de_sim
github.com/gtpb/elb18s
github.com/gtpb/ader18s
github.com/gtpb/cpang18
github.com/gtpb/pda18
github.com/gtpb/ader18f
github.com/gtpb/elb18f
github.com/gtpb/3daroc18
github.com/gtpb/pgdh18
github.com/cbg-ethz/v-pipe
github.com/opencobra/cobratoolbox
github.com/openrisknet/workshop
github.com/eudat-training/b2safe-b2stage-training
github.com/uclouvain-cbio/bss2019
github.com/enanomapper/tutorials
github.com/egonw/cdkbook
github.com/vjirsa/bootcamp
github.com/trainthetrainer/elixir-excelerate-ttt
github.com/nanocommons/tutorials
github.com/bigcat-um/pils
github.com/bigcat-um/bridgedbvariantdatabase
github.com/egonw/metawinterschool-bigcat
github.com/bioinformaticsdotca/genomic_med_2017
github.com/vdda/revealjs_test
github.com/swcarpentry/bc
github.com/datacarpentry/python-ecology
github.com/datacarpentry/openrefine-ecology-lesson

Get contributors info

Using the GitHub REST API to get the number of contributors for each repository on glittr.org. This takes a few minutes, so if the contributors haven’t changed, it will use a cached version.

Code
# Querying contributor counts takes a long time, so cached results are
# reused when the set of repositories has not changed in the meantime.

# Load cached counts if present.
if (file.exists("data/n_contributors.rds")) {
  n_contributors <- readRDS("data/n_contributors.rds")
} else {
  n_contributors <- NULL
}

# Contributor counts are only fetched for GitHub-hosted repositories.
repo_info_gh <- repo_info[repo_info$provider == "github", ]

# Refresh from the GitHub API when the cache does not cover the current set
# of repositories.
if (!identical(sort(repo_info_gh$repo), sort(names(n_contributors)))) {
  dir.create("data", showWarnings = FALSE)
  # vapply() keeps the result a named numeric vector even for empty input.
  n_contributors <- vapply(repo_info_gh$repo, function(x) {

    # Ask for one contributor per page; the page count in the "last" Link
    # header then equals the number of contributors.
    resp <- request("https://api.github.com/repos/") |>
      req_url_path_append(x) |>
      req_url_path_append("contributors") |>
      req_url_query(per_page = 1) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28"
      ) |>
      req_perform()

    link_url <- resp_link_url(resp, "last")
    if (is.null(link_url)) {
      # No pagination link: the repository has a single contributor.
      1
    } else {
      as.numeric(strsplit(link_url, "&page=")[[1]][2])
    }
  }, numeric(1))

  # Overwrite the cache with the fresh counts.
  saveRDS(n_contributors, "data/n_contributors.rds")
}

repo_info_gh$contributors <- n_contributors[repo_info_gh$repo]

Get country information

Here we get country information for all authors and organizations. It uses the free text specified in the GitHub 'location' field. Since this can be anything, we use the Google Geocoding REST API to translate it into a country.

Code
# Load cached author info (if any) so we can decide whether a refresh is
# needed.
if (file.exists("data/author_info.rds")) {
  author_info <- readRDS("data/author_info.rds")
  author_info_authors <- sort(unique(author_info$author))
} else {
  author_info_authors <- NULL
}

# Unique, sorted GitHub authors currently on glittr.org.
gh_authors <- sort(unique(repo_info$author_name[repo_info$provider == "github"]))

# Refresh the author info from the GitHub API when it is out of date.
if (!identical(gh_authors, author_info_authors)) {
  author_info_list <- list()
  for (author in gh_authors) {

    parsed <- request("https://api.github.com/users/") |>
      req_url_path_append(author) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28"
      ) |>
      req_perform() |>
      resp_body_json()

    author_info_list[[author]] <- data.frame(
      author = parsed$login,
      type = parsed$type,
      # scalar fields, so plain if/else rather than vectorized ifelse()
      name = if (is.null(parsed$name)) NA else parsed$name,
      location = if (is.null(parsed$location)) NA else parsed$location
    )
  }

  author_info <- do.call(rbind, author_info_list)

  # Geocode the free-text 'location' of the authors that specify one.
  author_info_loc <- author_info[!is.na(author_info$location), ]

  author_loc <- author_info_loc$location
  names(author_loc) <- author_info_loc$author

  ggmap::register_google(key = google_api_key)
  loc_info <- ggmap::geocode(author_loc,
                             output = 'all')

  # Return the long name of the first address component of type "country"
  # in a geocoding result, or NA_character_ when none is found.
  # (The previous implementation overwrote `country` on every result, so a
  # later result without a country component discarded an earlier match,
  # and it errored on responses whose result list was empty.)
  get_country <- function(loc_results) {
    if (!("results" %in% names(loc_results))) return(NA_character_)
    for (result in loc_results$results) {
      for (component in result$address_components) {
        if ("country" %in% unlist(component$types)) {
          return(component$long_name)
        }
      }
    }
    NA_character_
  }

  countries <- vapply(loc_info, get_country, character(1))
  names(countries) <- names(author_loc)

  author_info$country <- countries[author_info$author]

  saveRDS(author_info, "data/author_info.rds")
}

# Attach author metadata to every repository; repositories whose author has
# no resolvable country get the explicit label "undefined".
repo_info <- merge(repo_info, author_info,
                   by.x = "author_name", by.y = "author")
repo_info$country[is.na(repo_info$country)] <- "undefined"
  • Number of authors: 365
  • Number of countries: 30

Parse tag data

Here, we create tag_df that contains information for each tag by using the glittr.org API.

# Fetch tag metadata (one entry per category) from the glittr.org API.
parsed <- request("https://glittr.org/api/tags") |>
  req_perform() |>
  resp_body_json()

# One data frame per category: tag name, category and repository count.
tag_dfs <- list()
for (cat_entry in parsed) {
  tag_dfs[[cat_entry$category]] <- data.frame(
    name = sapply(cat_entry$tags, function(tag) tag$name),
    category = cat_entry$category,
    repositories = sapply(cat_entry$tags, function(tag) tag$repositories)
  )
}

tag_df <- do.call(rbind, tag_dfs) |> arrange(repositories)

Number of tags/topics: 67

Number of repositories by category

This is figure 2A in the manuscript.

# Bar chart: number of repositories per category (figure 2A).
cat_count_data <- as.data.frame(table(category = repo_info$main_category))

cat_count_plot <- ggplot(cat_count_data,
                         aes(x = reorder(category, Freq), y = Freq,
                             fill = category)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank()) +
  labs(title = "Categories", y = "Number of repositories")

print(cat_count_plot)
Figure 1: Number of repositories per category

And a table with the actual numbers

# Same category counts as the plot above, rendered as a table.
category_count <- as.data.frame(table(category = repo_info$main_category))
knitr::kable(category_count)
Table 3: Number of repositories per category
category Freq
Scripting and languages 338
Computational methods and pipelines 56
Omics analysis 209
Reproducibility and data management 58
Statistics and machine learning 103
Others 30

Number of contributors per repository separated by category

This is figure 2B in the manuscript.

# Violin + boxplot of contributor counts per category (figure 2B).
# Re-level the factor so the category order matches the sorted counts.
repo_info_gh$main_category <- factor(repo_info_gh$main_category,
                                     levels = names(cat_table))

contributors_plot <- ggplot(repo_info_gh,
                            aes(x = main_category, y = contributors,
                                fill = main_category)) +
  geom_violin(scale = "width") +
  geom_boxplot(width = 0.1, col = "darkgrey") +
  coord_flip() +
  labs(title = "Contributors", y = "Number of contributors") +
  scale_y_sqrt() +
  scale_fill_manual(values = glittr_cols) +
  theme_bw() +
  theme(legend.position = "none",
        axis.title.y = element_blank(),
        plot.margin = margin(t = 5, r = 10, b = 5, l = 10))

print(contributors_plot)
Figure 2: Number of contributors per repository separated by category

And some statistics of contributors.

# Shares of repositories by contributor count (proportions in [0, 1]).
nna_contr <- repo_info_gh$contributors
param1 <- mean(nna_contr > 10)   # share with more than 10 contributors
param2 <- mean(nna_contr > 1)    # share with more than one contributor
param3 <- mean(nna_contr <= 5)   # share with at most 5 contributors
  • More than 10 contributors: 23.8%
  • More than 1 contributor: 78.2%
  • Between 1 and 5 contributors: 62%

Number of repositories per tag

This is figure 2C in the manuscript.

# Bar chart: repositories per tag, restricted to tags with more than 10
# repositories (figure 2C).
tag_freq_plot <- tag_df |>
  filter(repositories > 10) |>
  ggplot(aes(x = reorder(name, repositories),
             y = repositories, fill = category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Tags with > 10 repositories",
       y = "Number of repositories") +
  annotate(geom = "text", x = 2, y = 150,
           label = paste("Total number of tags: ",
                         nrow(tag_df)),
           color = "black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(tag_freq_plot)
Figure 3: Number of repositories per tag, colored by category.

And a table with the actual numbers.

# Table of the tags with more than 10 repositories, most frequent first.
frequent_tags <- tag_df |>
  filter(repositories > 10) |>
  arrange(desc(repositories))

knitr::kable(frequent_tags, row.names = FALSE)
Table 4: Number of repositories per tag
name category repositories
R Scripting and languages 292
Python Scripting and languages 119
Transcriptomics Omics analysis 104
RNA-seq Omics analysis 97
Next generation sequencing Omics analysis 87
Genomics Omics analysis 74
Statistics Statistics and machine learning 71
Data science Statistics and machine learning 63
Machine learning Statistics and machine learning 62
Single-cell sequencing Omics analysis 59
Data management Reproducibility and data management 54
Unix/Linux Scripting and languages 49
Version control Scripting and languages 48
Reproducibility Reproducibility and data management 47
Data visualization Scripting and languages 44
FAIR data Reproducibility and data management 39
Variant analysis Omics analysis 37
General Others 36
Workflows Computational methods and pipelines 24
Metagenomics Omics analysis 24
Containerization Computational methods and pipelines 21
Shiny Scripting and languages 20
Spatial transcriptomics Omics analysis 20
Genome assembly Omics analysis 19
Nextflow Computational methods and pipelines 17
ChIP-seq Omics analysis 17
Microbiology Omics analysis 17
Docker Computational methods and pipelines 16
Quarto Scripting and languages 14
Julia Scripting and languages 14
High performance computing Computational methods and pipelines 14
Long read sequencing Omics analysis 14
Artificial intelligence Statistics and machine learning 14
Image analysis Computational methods and pipelines 13
ATAC-seq Omics analysis 13
Genome annotation Omics analysis 13
Epigenetics Omics analysis 11

Number of repositories by author

This is figure 2D in the manuscript.

# Repositories per author, split by category (figure 2D). Authors with
# fewer than 5 repositories are only summarized in an annotation.
author_freq <- as.data.frame(table(author_name = repo_info$author_name,
                                   main_category = repo_info$main_category))

author_freq$main_category <- factor(author_freq$main_category,
                                    levels = names(cat_table))

repos_per_author <- table(repo_info$author_name)

# Low-frequency authors (< 5 repositories) are excluded from the plot.
lf_authors <- names(repos_per_author)[repos_per_author < 5]

author_freq_plot <- author_freq |>
  filter(!author_name %in% lf_authors) |>
  arrange(Freq) |>
  ggplot(aes(x = reorder(author_name, Freq), y = Freq,
             fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Author or organization", y = "Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  annotate(geom = "text", x = 2, y = 30,
           label = paste("Authors with < 5 repos: ",
                         length(lf_authors)),
           color = "black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(author_freq_plot)
Figure 4: Number of repositories per author colored by category

And a table with the actual numbers.

# Table of authors with at least 5 repositories, most prolific first.
prolific_authors <- table(repo_info$author_name) |>
  as.data.frame() |>
  filter(Freq >= 5) |>
  arrange(desc(Freq))

knitr::kable(prolific_authors)
Table 5: Number of repositories per author
Var1 Freq
carpentries-incubator 51
sib-swiss 42
bioinformaticsdotca 34
NBISweden 22
posit-conf-2023 20
hbctraining 17
posit-conf-2024 17
ucdavis-bioinformatics-training 17
datacarpentry 16
bioinformatics-core-shared-training 13
GTPB 12
bioinformatics-ca 10
fhdsl 10
rstudio-conf-2022 9
learnbyexample 8
vib-training-conferences 8
biocorecrg 7
ELIXIREstonia 7
RockefellerUniversity 7
semacu 7
BU-ISCIII 6
JuliaAcademy 6
swcarpentry 6
cambiotraining 5
carpentries-lab 5
elixir-europe-training 5
hadley 5
jhudsl 5
MolSSI-Education 5

Number of repositories per license

This is figure 2E in the manuscript.

# Repositories per license type, coloured by category (figure 2E).
lic_freq_data <- as.data.frame(table(license = repo_info$license,
                                     main_category = repo_info$main_category))

lic_freq_data$main_category <- factor(lic_freq_data$main_category,
                                      levels = names(cat_table))

lic_freq_plot <- lic_freq_data |>
  ggplot(aes(x = reorder(license, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  theme_classic() +
  labs(title = "License type", y = "Number of repositories") +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(lic_freq_plot)
Figure 5: Number of repositories per license

And a table with the actual numbers.

# License counts with their percentage of all repositories.
license_count <- repo_info$license |>
  table() |>
  as.data.frame() |>
  mutate(perc = round(Freq / nrow(repo_info) * 100, 1)) |>
  arrange(desc(Freq))

knitr::kable(license_count)
Table 6: Number of repositories per license
Var1 Freq perc
other 248 31.2
none 239 30.1
mit 92 11.6
cc-by-4.0 73 9.2
cc-by-sa-4.0 50 6.3
gpl-3.0 34 4.3
cc0-1.0 27 3.4
bsd-3-clause 13 1.6
apache-2.0 12 1.5
agpl-3.0 2 0.3
artistic-2.0 2 0.3
unlicense 1 0.1
wtfpl 1 0.1

Number of repositories per country

This is figure 2F in the manuscript.

# Repositories per country, coloured by category (figure 2F). Repositories
# without a resolvable country are summarized in an annotation.
country_freq <- as.data.frame(table(country = repo_info$country,
                                    main_category = repo_info$main_category))

country_freq$main_category <- factor(country_freq$main_category,
                                     levels = names(cat_table))

country_freq_plot <- country_freq |>
  filter(country != "undefined") |>
  ggplot(aes(x = reorder(country, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Country", y = "Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  annotate(geom = "text", x = 2, y = 70,
           label = paste("Repos with undefined country: ",
                         sum(repo_info$country == "undefined")),
           color = "black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())

print(country_freq_plot)
Figure 6: Number of repositories per country colored by category

And a table with the actual numbers.

# Country counts (including "undefined"), most frequent first.
country_count <- repo_info$country |>
  table() |>
  as.data.frame() |>
  arrange(desc(Freq))

knitr::kable(country_count)
Table 7: Number of repositories per country
Var1 Freq
undefined 311
United States 194
Canada 52
Switzerland 51
United Kingdom 30
Sweden 26
Australia 18
Germany 18
Belgium 15
France 13
Portugal 13
Netherlands 12
Spain 9
Denmark 5
India 4
Norway 4
Italy 3
Bulgaria 2
Ireland 2
Luxembourg 2
Argentina 1
China 1
Czechia 1
Estonia 1
Finland 1
Mexico 1
New Zealand 1
Poland 1
South Africa 1
Ukraine 1

Summary plot

Full figure 2 of the manuscript.

# Assemble figure 2 of the manuscript from the six panels.
p <- plot_grid(cat_count_plot, contributors_plot,
               tag_freq_plot, author_freq_plot,
               lic_freq_plot, country_freq_plot,
               ncol = 2, labels = LETTERS[1:6],
               rel_heights = c(2, 3, 3))

# Pass the grid explicitly: without `plot = p`, ggsave() saves the last
# displayed ggplot (the country panel), not the assembled grid.
ggsave("grid_plot_fig2.pdf", plot = p, width = 10, height = 10)
ggsave("grid_plot_fig2.eps", plot = p, width = 10, height = 10)