Glittr stats
In this report you can find some general statistics about Glittr.org. The plots and statistics created here are used, among other things, in the manuscript. Since Glittr.org is an ongoing project, these statistics are updated weekly.
Set up the environment
This is required if you run this notebook locally. Loading required packages.
To run locally, create a file named .env and add your GitHub PAT (variable named PAT) and Google API key (variable named GOOGLE_API_KEY) to it, e.g.:
# this is an example, store it as .env:
export PAT="ghp_aRSRESCTZII20Lklser3H"
export GOOGLE_API_KEY="AjKSLE5SklxuRsxwPP8s0"
Now use your UNIX terminal to source this file to get the keys as objects:
source .env
In R, get environment variables as objects:
# Read the API credentials from the environment (see the .env example above);
# both are the empty string when the variable is not set
pat <- Sys.getenv("PAT", unset = "")
google_api_key <- Sys.getenv("GOOGLE_API_KEY", unset = "")
Setting colors. These correspond to the category colours on glittr.org.
# Named colour vector: names are the category labels, values the hex colours.
# The order of the names is also used as the factor level order for categories.
glittr_cols <- setNames(
  c("#3a86ff", "#fb5607", "#ff006e", "#ffbe0b", "#8338ec", "#000000"),
  c(
    "Scripting and languages",
    "Computational methods and pipelines",
    "Omics analysis",
    "Reproducibility and data management",
    "Statistics and machine learning",
    "Others"
  )
)
Parse repository data
Using the glittr.org REST API to get repository metadata, among which the stargazers, recency, category, license and tags.
Code
# Download the metadata of all repositories from the glittr.org API as a
# nested list
parsed <- request("https://glittr.org/api/repositories") |>
  req_perform() |>
  resp_body_json()

# Flatten the relevant fields of each repository into a one-row data frame.
# Only the first tag of a repository is used as its main tag and category.
repo_info_list <- lapply(parsed$data, function(x) {
  data.frame(
    repo = x$name,
    author_name = x$author$name,
    stargazers = x$stargazers,
    recency = x$days_since_last_push,
    url = x$url,
    # the test is a scalar, so use plain if/else instead of vectorised ifelse()
    license = if (is.null(x$license)) "none" else x$license,
    main_tag = x$tags[[1]]$name,
    main_category = x$tags[[1]]$category
  )
})
repo_info <- do.call(rbind, repo_info_list)

# create a column with provider (either github or gitlab)
# Match on the host rather than on "github" anywhere in the URL, so that e.g.
# a GitLab project with "github" in its path is not labelled as GitHub (and
# subsequently queried through the GitHub API).
is_github <- grepl("^https?://(www\\.)?github\\.com/", repo_info$url,
                   ignore.case = TRUE)
repo_info$provider <- ifelse(is_github, "github", "gitlab")

# create a factor for categories for sorting
repo_info$main_category <- factor(repo_info$main_category,
                                  levels = names(glittr_cols))

# category table (sorted by increasing count) to keep order the same in the
# plots
cat_table <- table(category = repo_info$main_category)
cat_table <- sort(cat_table)
Number of repositories: 664
Get contributors info
Using the GitHub REST API to get the number of contributors for each GitHub repository on glittr.org. This takes a few minutes, so if the set of repositories hasn’t changed, a cached version is used.
Code
# Retrieving the contributors takes a long time, so start from the cached
# result of an earlier run (data/n_contributors.rds) when it exists.
# NULL means that no cache is available.
n_contributors <- NULL
if (file.exists("data/n_contributors.rds")) {
  n_contributors <- readRDS("data/n_contributors.rds")
}

# contributor information is only retrieved for repositories on GitHub
repo_info_gh <- repo_info[repo_info$provider %in% "github", ]
# get contributor info from github api if update is needed, i.e. when the
# cached repository names differ from the current GitHub repositories
if (!identical(sort(repo_info_gh$repo), sort(names(n_contributors)))) {
  dir.create("data", showWarnings = FALSE)

  # vapply() always yields a numeric vector named by repository, also when
  # there are no repositories (sapply() would return an empty list)
  n_contributors <- vapply(repo_info_gh$repo, function(x) {
    # request the contributors with a single contributor per page, so the
    # page number of the "last" link is used as the number of contributors
    resp <- request("https://api.github.com/repos/") |>
      req_url_path_append(x) |>
      req_url_path_append("contributors") |>
      req_url_query(per_page = 1) |>
      req_headers(
        Accept = "application/vnd.github+json",
        Authorization = paste("Bearer", pat),
        `X-GitHub-Api-Version` = "2022-11-28"
      ) |>
      req_perform()

    link_url <- resp_link_url(resp, "last")

    # no "last" link in the response: counted as one contributor
    if (is.null(link_url)) {
      return(1)
    }

    # Read `page` from the parsed query string. Splitting the URL on "&page="
    # only works when `page` is not the first query parameter.
    as.numeric(httr2::url_parse(link_url)$query$page)
  }, FUN.VALUE = numeric(1))

  # overwrite rds file
  saveRDS(n_contributors, "data/n_contributors.rds")
}

# add the number of contributors, matched by repository name
repo_info_gh$contributors <- n_contributors[repo_info_gh$repo]
Get country information
Here we get country information for all authors and organizations. It uses the free text specified at ‘location’ in their GitHub profile. Since this can be anything, we use the Google geocoding API (via ggmap) to translate it into a country.
Code
# Load the cached author information when it exists. author_info_authors
# holds the sorted, unique authors in the cache and stays NULL without a cache.
author_info_authors <- NULL
if (file.exists("data/author_info.rds")) {
  author_info <- readRDS("data/author_info.rds")
  author_info_authors <- sort(unique(author_info$author))
}

# sorted, unique authors of the repositories hosted on GitHub
gh_authors <- sort(unique(
  repo_info$author_name[repo_info$provider == "github"]
))
if(!identical(gh_authors, author_info_authors)) {
author_info_list <- list()
for(author in gh_authors) {
parsed <- request("https://api.github.com/users/") |>
req_url_path_append(author) |>
req_headers(
Accept = "application/vnd.github+json",
Authorization = paste("Bearer", pat),
`X-GitHub-Api-Version` = "2022-11-28",
) |>
req_perform() |>
resp_body_json()
author_info_list[[author]] <- data.frame(
author = parsed$login,
type = parsed$type,
name = ifelse(is.null(parsed$name), NA, parsed$name),
location = ifelse(is.null(parsed$location), NA, parsed$location)
)
}
author_info <- do.call(rbind, author_info_list)
author_info_loc <- author_info[!is.na(author_info$location), ]
author_loc <- author_info_loc$location
names(author_loc) <- author_info_loc$author
ggmap::register_google(key = google_api_key)
loc_info <- ggmap::geocode(author_loc,
output = 'all')
get_country <- function(loc_results) {
if("results" %in% names(loc_results)) {
for(results in loc_results$results) {
address_info <- results$address_components |>
lapply(unlist) |>
do.call(rbind, args = _) |>
as.data.frame()
country <- address_info$long_name[address_info$types1 == "country"]
if (length(country) == 0) next
}
if (length(country) == 0) return(NA)
return(country)
} else {
return(NA)
}
}
countries <- sapply(loc_info, get_country)
names(countries) <- names(author_loc)
author_info$country <- countries[author_info$author]
saveRDS(author_info, "data/author_info.rds")
}
# Add the author information to the repositories. all.x = TRUE makes this a
# left join, so repositories whose author has no row in author_info are kept
# (author_info is only retrieved for authors of GitHub repositories). With
# the default inner join these repositories were silently dropped from every
# statistic computed from repo_info afterwards.
repo_info <- merge(repo_info, author_info, by.x = "author_name",
                   by.y = "author", all.x = TRUE)

# repositories without a known country are labelled "undefined"
repo_info$country[is.na(repo_info$country)] <- "undefined"
- Number of authors: 313
- Number of countries: 26
Parse tag data
Here, we create tag_df, which contains information for each tag, by using the glittr.org API.
# Get all tags, grouped by category, from the glittr.org API
parsed <- request("https://glittr.org/api/tags") |>
  req_perform() |>
  resp_body_json()

# Build one data frame per category with a row for each tag: the tag name,
# its category and its `repositories` value
tag_dfs <- list()
for (tag_group in parsed) {
  tags <- tag_group$tags
  tag_dfs[[tag_group$category]] <- data.frame(
    name = sapply(tags, function(tag) tag$name),
    category = tag_group$category,
    repositories = sapply(tags, function(tag) tag$repositories)
  )
}

# combine the categories and sort the tags by increasing `repositories`
tag_df <- arrange(do.call(rbind, tag_dfs), repositories)
Number of tags/topics: 58
Number of repositories by category
This is figure 2A in the manuscript.
# Horizontal bar chart with the number of repositories per main category,
# ordered by count and coloured with the category colours
cat_count_plot <- ggplot(
  as.data.frame(table(category = repo_info$main_category)),
  aes(x = reorder(category, Freq), y = Freq, fill = category)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  labs(title = "Categories", y = "Number of repositories") +
  theme_classic() +
  theme(axis.title.y = element_blank(),
        legend.position = "none")
print(cat_count_plot)
And a table with the actual numbers
# number of repositories per main category, rendered as a table
category_count <- as.data.frame(table(category = repo_info$main_category))
knitr::kable(category_count)
category | Freq |
---|---|
Scripting and languages | 367 |
Computational methods and pipelines | 65 |
Omics analysis | 102 |
Reproducibility and data management | 58 |
Statistics and machine learning | 42 |
Others | 28 |
Number of contributors per repository separated by category
This is figure 2B in the manuscript.
# use the category order of cat_table so the categories appear in the same
# order as in the other plots
repo_info_gh$main_category <- factor(
  repo_info_gh$main_category,
  levels = names(cat_table)
)

# Violin plots with a narrow boxplot inside, showing the number of
# contributors per repository for each category on a square-root scale
contributors_plot <- ggplot(
  repo_info_gh,
  aes(x = main_category, y = contributors, fill = main_category)
) +
  geom_violin(scale = "width") +
  geom_boxplot(width = 0.1, colour = "darkgrey") +
  scale_y_sqrt() +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "Contributors", y = "Number of contributors") +
  theme_bw() +
  theme(axis.title.y = element_blank(),
        legend.position = "none",
        plot.margin = margin(t = 5, r = 10, b = 5, l = 10))
print(contributors_plot)
And some statistics of contributors.
- More than 10 contributors: 24.8%
- More than 1 contributor: 80.1%
- Between 1 and 5 contributors: 60.3%
Number of repositories per tag
This is figure 2C in the manuscript.
# Horizontal bar chart of the tags with more than 10 repositories, ordered by
# the number of repositories and coloured by category. The total number of
# tags (including those not shown) is added as a text annotation.
tag_freq_plot <- tag_df |>
  filter(repositories > 10) |>
  ggplot(aes(x = reorder(name, repositories),
             y = repositories, fill = category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = glittr_cols) +
  ggtitle("Tags with > 10 repositories") +
  ylab("Number of repositories") +
  # paste() already separates its arguments with a space; a trailing space
  # in the label text resulted in a double space before the number
  annotate(geom = "text", x = 2, y = 150,
           label = paste("Total number of tags:",
                         nrow(tag_df)),
           color = "black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())
print(tag_freq_plot)
And a table with the actual numbers.
Number of repositories per license
This is figure 2E in the manuscript.
# number of repositories for each combination of license and main category
lic_freq_data <- as.data.frame(
  table(license = repo_info$license,
        main_category = repo_info$main_category)
)

# use the category order of cat_table so the categories are stacked in the
# same order as in the other plots
lic_freq_data$main_category <- factor(
  lic_freq_data$main_category,
  levels = names(cat_table)
)

# horizontal stacked bar chart of the licenses, filled by category
lic_freq_plot <- ggplot(
  lic_freq_data,
  aes(x = reorder(license, Freq), y = Freq, fill = main_category)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = glittr_cols) +
  coord_flip() +
  labs(title = "License type", y = "Number of repositories") +
  theme_classic() +
  theme(axis.title.y = element_blank(),
        legend.position = "none")
print(lic_freq_plot)
And a table with the actual numbers.
# Table with the number and percentage (one decimal) of repositories per
# license, most frequent license first
as.data.frame(table(repo_info$license)) |>
  mutate(perc = round(Freq / nrow(repo_info) * 100, 1)) |>
  arrange(desc(Freq)) |>
  knitr::kable()
Var1 | Freq | perc |
---|---|---|
other | 224 | 33.8 |
none | 191 | 28.9 |
mit | 74 | 11.2 |
cc-by-sa-4.0 | 48 | 7.3 |
cc-by-4.0 | 40 | 6.0 |
gpl-3.0 | 30 | 4.5 |
cc0-1.0 | 25 | 3.8 |
apache-2.0 | 12 | 1.8 |
bsd-3-clause | 12 | 1.8 |
agpl-3.0 | 2 | 0.3 |
artistic-2.0 | 2 | 0.3 |
unlicense | 1 | 0.2 |
wtfpl | 1 | 0.2 |
Number of repositories per country
This is figure 2F in the manuscript.
# number of repositories for each combination of country and main category
country_freq <- table(country = repo_info$country,
                      main_category = repo_info$main_category) |>
  as.data.frame()

# use the category order of cat_table so the categories are stacked in the
# same order as in the other plots
country_freq$main_category <- factor(country_freq$main_category,
                                     levels = names(cat_table))

# Horizontal stacked bar chart of the repositories per country, filled by
# category. Repositories with an undefined country are left out of the bars;
# their number is reported in a text annotation instead.
country_freq_plot <- country_freq |>
  filter(country != "undefined") |>
  ggplot(aes(x = reorder(country, Freq), y = Freq, fill = main_category)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Country") +
  ylab("Number of repositories") +
  scale_fill_manual(values = glittr_cols) +
  # paste() already separates its arguments with a space; a trailing space
  # in the label text resulted in a double space before the number
  annotate(geom = "text", x = 2, y = 70,
           label = paste("Repos with undefined country:",
                         sum(repo_info$country == "undefined")),
           color = "black") +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_blank())
print(country_freq_plot)
And a table with the actual numbers.
# Table with the number of repositories per country, most frequent country
# first
as.data.frame(table(repo_info$country)) |>
  arrange(desc(Freq)) |>
  knitr::kable()
Var1 | Freq |
---|---|
undefined | 268 |
United States | 176 |
Canada | 31 |
Switzerland | 30 |
Sweden | 23 |
United Kingdom | 23 |
Australia | 15 |
France | 14 |
Germany | 14 |
Netherlands | 12 |
Portugal | 11 |
Belgium | 10 |
Spain | 8 |
Denmark | 4 |
India | 4 |
Norway | 4 |
Ireland | 3 |
Italy | 3 |
Bulgaria | 2 |
Argentina | 1 |
China | 1 |
Finland | 1 |
Luxembourg | 1 |
Mexico | 1 |
Poland | 1 |
Ukraine | 1 |
Summary plot
Full figure 2 of the manuscript.