### ### stevens_duque_2018_rcode.R ### Created by Jeffrey R. Stevens on 17 Jun 2016 (jeffrey.r.stevens@gmail.com), ### finalized on 12 Jul 2018 ### Summary: This script analyzes and produces plots for citation bias analysis. ### Instructions: Place this file and the data files in the main directory. ### Create a folder called "figures". Set the R working directory to the ### main directory. At the R command prompt, type ### > source("stevens_duque_2018_rcode.R") ### This will run the script, adding all of the calculated variables to the ### workspace and saving figures in the figures directory. If packages do not ### load properly, install them with install.packages("package_name"). ### Data files: ### stevens_duque_2018_data.csv ### data_set - signals first or second data set ### journal - name of the journal that published the article ### field - field for journal ### citation_style - citation style for journal at that time ### year - publication year for the article ### author - article's first author's surname and initials ### times_cited - number of times the article was cited according to Web of Science ### as of Jun 2016 (data set 1) or Apr 2017 (data set 2) ### License: This script is released under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 ### International license (CC BY-NC-SA 4.0). You may share and adapt this content with attribution, for ### non-commercial purposes if you ShareAlike (distribute any contributions under the same license). ### # Load libraries ---------------------------------------------------------- library(BayesFactor) # needed for Bayesian analysis library(car) # needed for Recode library(foreach) # needed for foreach library(grid) # needed for grid.text library(lme4) # needed for linear mixed models library(papaja) # needed for APA formatting library(scales) # needed for comma library(tidyverse) # needed for tidyverse functions # Define functions ---------------------------------------------------------- ### ## Create function that is the inverse of %in%, that is it finds items that do not match the vector ### "%notin%" <- function(x, table) { # create function analogous to %in% that searches for items not in a vector match(x, table, nomatch = 0) == 0 } ### ## Convert BIC values to Bayes factor (from Wagenmakers, E.-J. (2007). A practical solution to the pervasive problems of p values. Psychonomic Bulletin & Review, 14(5), 779–804. https://dx.doi.org/10.3758/BF03194105) ### bic_bf10 <- function(null, alternative) { new_bf <- exp((null - alternative) / 2) # convert BICs to Bayes factor names(new_bf) <- NULL # remove BIC label return(new_bf) # return Bayes factor of alternative over null hypothesis } ### ## Extract the top quantile of data for each letter ### extractTopQuantile <- function(citation_df) { top_quantile <- citation_df[1, ] # initiate data frame with first row of data foreach(letter = LETTERS) %do% { # for each letter current_letter_data <- subset(citation_df, author == letter) # subset the data for this letter current_quantile <- quantile(current_letter_data$times_cited, prob = seq(0, 0.9, length = num_quantiles))[quant] # find cutoff for specified quantile current_top_data <- subset(current_letter_data, times_cited >= current_quantile) # subset the data equal to or above the quantile cutoff top_quantile <- rbind(top_quantile, current_top_data) # append data } top_quantile <- top_quantile[-1, ] # remove initial row } ### ## Create function that calculates correlation coefficent, slope, intercept, and BF for each data group ### calculateCorrelationPerGroup <- function(citation_rate_group, group_names) { # Prepare new data frames group_citation_rate_df <- data.frame(matrix(rep(NA, 6), ncol = 6)) # initiate data frame for author data names(group_citation_rate_df) <- c("group", "author", "author_num", "mean_prob_cited", "mean_citation_rate", "percent_citation_rate") # label data frame columns names(citation_rate_group) <- names(group_citation_rate_df)[-length(group_citation_rate_df[1, ])] # remove last label group_citation_rate_stats <- group_prob_cited_stats <- data.frame(matrix(rep(NA, 7 * length(group_names)), ncol = 7)) # initiate data frame for statistics names(group_citation_rate_stats) <- names(group_prob_cited_stats) <- c("group", "tau", "tau_lowerCI", "tau_upperCI", "beta", "intercept", "bf") # label data frame columns group_counter <- 1 # initiate group counter # Calculate stats for each group foreach(current_group = group_names) %do% { # for each group # Filter data for this group and calculate percent citation rate if(length(group_names) > 1) { # if there is more than one group citation_letters_group <- subset(citation_rate_group, group == current_group) # filter data based on group } else { # if it is the overall analysis citation_letters_group <- citation_rate_group # use all data } citation_letters_group$percent_citation_rate <- citation_letters_group$mean_citation_rate / sum(citation_letters_group$mean_citation_rate, na.rm = TRUE) * 100 # calculate percentage for each letter citation_letters_group <- data.frame(citation_letters_group) # convert to data frame group_citation_rate_df <- rbind(group_citation_rate_df, citation_letters_group) # append to data frame of all groups # Create data frame of citation rate statistics per group group_citation_rate_stats$group[group_counter] <- group_prob_cited_stats$group[group_counter] <- current_group # assign group group_citation_rate_stats$tau[group_counter] <- cor.test(citation_letters_group$percent_citation_rate, citation_letters_group$author_num, method = "kendall")$estimate # extract Kendall correlation coefficient group_citation_rate_stats$tau_lowerCI[group_counter] <- as.numeric(credibleIntervalKendallTau(group_citation_rate_stats$tau[group_counter], 26)[1]) # calculate lower Bayesian credible interval group_citation_rate_stats$tau_upperCI[group_counter] <- as.numeric(credibleIntervalKendallTau(group_citation_rate_stats$tau[group_counter], 26)[3]) # calculate upper Bayesian credible interval group_citation_rate_stats$bf[group_counter] <- bfCorrieKernelKendallTau(group_citation_rate_stats$tau[group_counter], 26)$bf10 # calculate Bayes factor for Kendall's tau of percent times cited ~ author group_citation_rate_stats$beta[group_counter] <- lm(percent_citation_rate ~ author_num, citation_letters_group)$coefficients[2] # calculate linear regression slope group_citation_rate_stats$intercept[group_counter] <- lm(percent_citation_rate ~ author_num, citation_letters_group)$coefficients[1] # calculate linear regression intercept # Create data frame of probability of citation statistics per group group_prob_cited_stats$tau[group_counter] <- cor.test(citation_letters_group$mean_prob_cited, citation_letters_group$author_num, method = "kendall")$estimate # extract Kendall correlation coefficient group_prob_cited_stats$tau_lowerCI[group_counter] <- as.numeric(credibleIntervalKendallTau(group_prob_cited_stats$tau[group_counter], 26)[1]) # calculate lower Bayesian credible interval group_prob_cited_stats$tau_upperCI[group_counter] <- as.numeric(credibleIntervalKendallTau(group_prob_cited_stats$tau[group_counter], 26)[3]) # calculate upper Bayesian credible interval group_prob_cited_stats$beta[group_counter] <- lm(mean_prob_cited ~ author_num, citation_letters_group)$coefficients[2] # calculate linear regression slope group_prob_cited_stats$intercept[group_counter] <- lm(mean_prob_cited ~ author_num, citation_letters_group)$coefficients[1] # calculate linear regression intercept group_prob_cited_stats$bf[group_counter] <- bfCorrieKernelKendallTau(group_prob_cited_stats$tau[group_counter], 26)$bf10 # calculate Bayes factor for Bayesian linear regression of percent times cited ~ author group_counter <- group_counter + 1 # increment counter } group_citation_rate_df <<- group_citation_rate_df[-1, ] # remove initiation row group_citation_rate_stats <<- group_citation_rate_stats # save citation rate statistics group_prob_cited_stats <<- group_prob_cited_stats # save probability of citation statistics } ### ## Calculate Bayes factor for Kendall's Tau (from van Doorn, J., Ly, A., Marsman, M., & Wagenmakers, E.-J. (in press). Bayesian inference for Kendall’s rank correlation coefficient. The American Statistician. https://doi.org/10.1080/00031305.2016.1264998). Script from https://osf.io/b9qhj/. ### # Prior specification Kendall's Tau scaledBetaTau <- function(tau, alpha=1, beta=1){ result <- ((pi*2^(-2*alpha))/beta(alpha,alpha)) * cos((pi*tau)/2)^(2*alpha-1) return(result) } priorTau <- function(tau, kappa){ scaledBetaTau(tau, alpha = (1/kappa), beta = (1/kappa)) } priorTauPlus <- function(tau, kappa=1) { non.negative.index <- tau >=0 less.than.one.index <- tau <=1 value.index <- as.logical(non.negative.index*less.than.one.index) result <- tau*0 result[value.index] <- 2*priorTau(tau[value.index], kappa) return(result) } priorTauMin <- function(tau, kappa=1) { negative.index <- tau <=0 greater.than.min.one.index <- tau >= -1 value.index <- as.logical(negative.index*greater.than.min.one.index) result <- tau*0 result[value.index] <- 2*priorTau(tau[value.index], kappa) return(result) } # Posterior specification Kendall's Tau postDensKendallTau <- function(delta,Tstar,n,kappa=1,var=var,test="two-sided"){ if(test == "two-sided"){priorDens <- priorTau(delta,kappa) } else if(test == "positive"){priorDens <- priorTauPlus(delta,kappa) } else if(test == "negative"){priorDens <- priorTauMin(delta,kappa)} priorDens <- priorTau(delta,kappa) dens <- dnorm(Tstar,(1.5*delta*sqrt(n)),sd=sqrt(var))* priorDens return(dens) } posteriorTau <- function(delta,kentau,n,kappa=1,var=1,test="two-sided"){ Tstar <- (kentau * ((n*(n-1))/2))/sqrt(n*(n-1)*(2*n+5)/18) var <- min(1,var) if(test == "two-sided"){lims <- c(-1,1) } else if(test == "positive"){lims <- c(0,1) } else if(test == "negative"){lims <- c(-1,0)} logicalCensor <- (delta >= lims[1] & delta <= lims[2]) dens <- logicalCensor*postDensKendallTau(delta,Tstar,n,kappa,var,test=test)/ integrate(function(delta){postDensKendallTau(delta,Tstar,n,kappa,var,test=test)},lims[1],lims[2])$value } # Bayes factor computation Kendall's Tau bfCorrieKernelKendallTau <- function(tau, n, kappa=1, var=1, ciValue=0.95){ tempList <- list(vector()) output <- list(n=n, r=tau, bf10=NA, bfPlus0=NA, bfMin0=NA) output$bf10 <- priorTau(0,kappa)/posteriorTau(0,tau,n,kappa=kappa,var=var,test="two-sided") output$bfPlus0 <- priorTauPlus(0,kappa)/posteriorTau(0,tau,n,kappa=kappa,var=var,test="positive") output$bfMin0 <- priorTauMin(0,kappa)/posteriorTau(0,tau,n,kappa=kappa,var=var,test="negative") return(output) } # Compute credible intervals kendalls tau credibleIntervalKendallTau <- function(kentau,n,kappa=1,var=1, test="two-sided", ciValue = 0.95){ nSeqs <- 1000 lowCI <- (1-ciValue)/2 upCI <- (1+ciValue)/2 taus <- seq(-1,1,length.out = (nSeqs-1)) densVals <- posteriorTau(taus, kentau, n, kappa = kappa, var = var, test = test) densVals <- cumsum((densVals[1:(nSeqs-1)]+densVals[2:nSeqs])*0.5*(taus[2]-taus[1])) lowerCI <- taus[which(densVals>=lowCI)[1]] upperCI <- taus[which(densVals>=upCI)[1]] median <- taus[which(densVals>=0.5)[1]] return(list(lowerCI = lowerCI, median = median, upperCI = upperCI)) } sampleTausA <- function(myTau,myN,nSamples = 3e3, var = 1){ nSeqs <- 1000 tauSamples <- NULL taus <- seq(-1,1,length.out = nSeqs) densVals <- posteriorTau(taus, myTau, myN, var = var) ceiling <- max(densVals) lowerB <- taus[which(round(densVals,digits=6) != 0 )][1] upperB <- rev(taus[which(round(densVals,digits=6) != 0 )])[1] while(length(tauSamples) < nSamples){ prop <- runif(1,lowerB,upperB) propDens <- posteriorTau(prop, myTau, myN, var = var) if(propDens > runif(1,0,ceiling)){tauSamples <- c(tauSamples,prop)} } return(tauSamples) } # Data set 1 ---------------------------------------------------------------- ### ## Input and process data file ### citation_data <- read_csv2("stevens_duque_2018_data.csv", progress = FALSE) # input data citation_data <- citation_data %>% filter(!is.na(citation_style)) # filter data_set 1 data citation_data1 <- citation_data %>% filter(data_set == 1) # filter data_set 1 data citation_data1 <- citation_data1 %>% mutate(author_num = as.numeric(as.factor(author)) - 1, year_num = year - 2000) # convert letters to numbers 0-25 and years to numbers 0-15 citation_data1$prob_cited <- ifelse(citation_data1$times_cited > 0, 1, 0) # convert times cited to probability of citation journal_names1 <- unique(citation_data1$journal) # create vector of journals citation_style_names1 <- unique(citation_data1$citation_style) # create vector of citation styles field_names1 <- unique(citation_data1$field) # create vector of fields ### ## Aggregate data for sample size calculations ### cited_df1 <- citation_data1 %>% group_by(journal, field, citation_style) %>% summarize(mean_citation_rate = round(mean(times_cited), 2), num_articles = length(times_cited)) # summarize citation count by journal, field, and citation style times_cited_overall_n1 <- format(sum(cited_df1$num_articles), big.mark = ",", trim=TRUE) # calculate overall number of articles times_cited_field_n1 <- format((cited_df1 %>% group_by(field) %>% summarize(num_articles = sum(num_articles)))$num_articles, big.mark = ",", trim=TRUE) # calculate number of articles for each field times_cited_citation_style_n1 <- format((cited_df1 %>% group_by(citation_style) %>% summarize(num_articles = sum(num_articles)))$num_articles, big.mark = ",", trim=TRUE)[1:3] # calculate number of articles for each citation style times_cited_journal1 <- cited_df1 %>% group_by(journal, field) %>% summarize(num_articles = sum(num_articles)) times_cited_journal1$n <- format(times_cited_journal1$num_articles, big.mark = ",", trim=TRUE) # calculate number of articles per journal ### ## Calculate effect of letter on citation rate per citation style ### # Calculate citation rate per citation style citation_rate_citation_style1 <- citation_data1 %>% group_by(citation_style, author, author_num) %>% summarize(mean_prob_cited = mean(prob_cited), mean_citation_rate = mean(times_cited)) # sum times cited by letter # Calculate correlations, slopes, intercepts, and Bayes factors for citation style calculateCorrelationPerGroup(citation_rate_citation_style1, citation_style_names1) citation_style_df1 <- group_citation_rate_df # save data citation_style_citation_rate_stats1 <- group_citation_rate_stats # save stats citation_style_prob_cited_stats1 <- group_prob_cited_stats # save stats # Calculate Bayesian linear regression comparing citation style * letter interaction to without interaction citation_style_null_bf1 <- lmBF(percent_citation_rate ~ author_num + group, data = citation_style_df1, progress = FALSE) # calculate Bayes factor for null model with just main effects citation_style_bf1 <- lmBF(percent_citation_rate ~ author_num * group, data = citation_style_df1, progress = FALSE) # calculate Bayes factor for null model with main effects and interaction citation_style_bf1 / citation_style_null_bf1 # calculate Bayes factor comparing interaction to null model # Plot mean citation rate per letter and citation style citation_style_tau_label1 <- paste("tau ==", round(citation_style_citation_rate_stats1$tau, 2)) times_cited_citation_style_ggplot1 <- ggplot(citation_style_df1, aes(x = author, y = percent_citation_rate)) + facet_grid(~group) + # for each group geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), citation_style_citation_rate_stats1, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels annotate("text", 13, 5.8, label = citation_style_tau_label1, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 5.5, label = paste("BF =", round(citation_style_citation_rate_stats1$bf, 2)), size = 18) + # add Bayes factor theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=40), strip.text.x = element_text(size = 60, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/citation_styles1.png"), width = 2000, height = 800) # open PNG device plot(times_cited_citation_style_ggplot1) # plot figure dev.off() # close device ### ## Calculate effect of letter on citation rate per field ### # Calculate citation rate per field citation_rate_field1 <- citation_data1 %>% group_by(field, author, author_num) %>% summarize(mean_prob_cited = mean(prob_cited), mean_citation_rate = mean(times_cited)) # sum times cited by letter # Calculate correlations, slopes, intercepts, and Bayes factors for citation style calculateCorrelationPerGroup(citation_rate_field1, field_names1) field_df1 <- group_citation_rate_df # save data field_citation_rate_stats1 <- group_citation_rate_stats # save stats field_prob_cited_stats1 <- group_prob_cited_stats # save stats field_df1$group <- factor(field_df1$group, levels = c("Psychology", "Biology")) # reorder fields field_citation_rate_stats1$group <- factor(field_citation_rate_stats1$group, levels = c("Psychology", "Biology")) # reorder fields field_prob_cited_stats1$group <- factor(field_prob_cited_stats1$group, levels = c("Psychology", "Biology")) # reorder fields # Calculate Bayesian linear regression comparing field * letter interaction to without interaction field_null_bf1 <- lmBF(percent_citation_rate ~ author_num + group, data = field_df1, progress = FALSE) # calculate Bayes factor for null model with just main effects field_bf1 <- lmBF(percent_citation_rate ~ author_num * group, data = field_df1, progress = FALSE) # calculate Bayes factor for null model with main effects and interaction field_bf1 / field_null_bf1 # calculate Bayes factor comparing interaction to null model # Plot mean citation rate per letter and field field_tau_label1 <- paste("tau ==", round(field_citation_rate_stats1$tau, 2)) times_cited_field_ggplot1 <- ggplot(field_df1, aes(x = author, y = percent_citation_rate)) + facet_grid(~group) + # for each group geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), field_citation_rate_stats1, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels annotate("text", 13, 5.8, label = field_tau_label1, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 5.5, label = paste("BF =", round(field_citation_rate_stats1$bf, 2)), size = 18) + # add Bayes factor theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=40), strip.text.x = element_text(size = 60, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/fields1.png"), width = 1400, height = 800) # open PNG device plot(times_cited_field_ggplot1) # plot figure dev.off() # close device ### ## Calculate effect of letter on citation rate per journal ### # Calculate citation rate per journal citation_rate_journal1 <- citation_data1 %>% group_by(journal, author, author_num) %>% summarize(mean_prob_cited = mean(prob_cited), mean_citation_rate = mean(times_cited)) # sum times cited by letter and journal # Calculate correlations, slopes, intercepts, and Bayes factors for citation style calculateCorrelationPerGroup(citation_rate_journal1, journal_names1) journal_df1 <- group_citation_rate_df # save data journal_citation_rate_stats1 <- group_citation_rate_stats # save stats journal_df1 <- merge(journal_df1, times_cited_journal1[, c("journal", "field")], by.x = "group", by.y = "journal") # add field column journal_citation_rate_stats1 <- merge(journal_citation_rate_stats1, times_cited_journal1[, c("journal", "field")], by.x = "group", by.y = "journal") # add field column ## Psychology data # Prepare data psychology_journal_labels1 <- c("American Psychologist", "Animal Cognition", "Annual Review of Psychology", "Current Directions in\nPsychological Science", "Journal of Comparative Psychology", "Journal of Experimental Psychology\nAnimal Learning and Cognition", "Journal of the\nExperimental Analysis of Behavior", "Learning and Behavior", "Perspectives on\nPsychological Science", "Psychological Bulletin", "Psychological Review", "Psychological Science", "Trends in Cognitive Sciences") # create vector of psychology journal names # Filter psychology journals psychology_journal_df1 <- filter(journal_df1, field == "Psychology") # filter out psychology journals psychology_journal_df1$journal <- factor(psychology_journal_df1$group, labels = psychology_journal_labels1) # relabel journals psychology_citation_rate_stats1 <- filter(journal_citation_rate_stats1, field == "Psychology") # filter out psychology journals for statistics psychology_citation_rate_stats1$journal <- psychology_journal_labels1 # relabel journals psychology_journal_stats1 <- filter(times_cited_journal1, field == "Psychology") # filter out psychology journals # Plot mean citation rate per psychology journal psychology_tau_label1 <- paste("tau ==", round(psychology_citation_rate_stats1$tau, 2)) psychology_ggplot1 <- ggplot(psychology_journal_df1, aes(x = author, y = percent_citation_rate)) + facet_wrap(~journal) + # for each journal geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), psychology_citation_rate_stats1, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels annotate("text", 13, 11, label = psychology_tau_label1, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 9.5, label = paste("BF =", round(psychology_citation_rate_stats1$bf, 1)), size = 18) + # add Bayes factor annotate("text", 13, 8, label = paste("N =", psychology_journal_stats1$n), size = 18) + # add tau value theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=30), strip.text.x = element_text(size = 25, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/psychology_journals1.png"), width = 1800, height = 2200) # open PNG device plot(psychology_ggplot1) # plot figure dev.off() # close device ## Biology data # Prepare data biology_journal_labels1 <- c("Animal Behaviour", "Annual Review of\nEcology, Evolution,and Systematics", "Behavioral Ecology and Sociobiology", "Behavioral Ecology", "Behaviour", "Biological Reviews", "Ethology", "Journal of Ethology", "Philosophical Transactions\nof the Royal Society", "PLOS Biology", "Proceedings of the Royal Society", "Quarterly Review of Biology", "Trends in Ecology and Evolution") # create vector of biology journal names # Filter biology journals biology_journal_df1 <- filter(journal_df1, field == "Biology") # filter out biology journals biology_journal_df1$journal <- factor(biology_journal_df1$group, labels = biology_journal_labels1) # relabel journals biology_citation_rate_stats1 <- filter(journal_citation_rate_stats1, field == "Biology") # filter out biology journals for statistics biology_citation_rate_stats1$journal <- biology_journal_labels1 # relabel journals biology_journal_stats1 <- filter(times_cited_journal1, field == "Biology") # filter out psychology journals # Plot mean citation rate per biology journal biology_tau_label1 <- paste("tau ==", round(biology_citation_rate_stats1$tau, 2)) biology_ggplot1 <- ggplot(biology_journal_df1, aes(x = author, y = percent_citation_rate)) + facet_wrap(~journal) + # for each journal geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), biology_citation_rate_stats1, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels annotate("text", 13, 15, label = biology_tau_label1, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 13, label = paste("BF =", round(biology_citation_rate_stats1$bf, 1)), size = 18) + # add Bayes factor annotate("text", 13, 11, label = paste("N =", biology_journal_stats1$n), size = 18) + # add N theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=30), strip.text.x = element_text(size = 25, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/biology_journals1.png"), width = 1800, height = 2200) # open PNG device plot(biology_ggplot1) # plot figure dev.off() # close device # Data set 2 ----------------------------------------------- ### ## Input and process data file ### citation_data2 <- citation_data %>% filter(data_set == 2) # filter data_set 1 data citation_data2 <- citation_data2 %>% mutate(author_num = as.numeric(as.factor(author)) - 1, year_num = year - 2000) # convert letters to numbers 0-25 and years to numbers 0-15 citation_data2$prob_cited <- ifelse(citation_data2$times_cited > 0, 1, 0) # convert times cited to probability of citation field_names2 <- unique(citation_data2$field) # create vector of fields ### ## Aggregate data for sample size calculations ### cited_df2 <- citation_data2 %>% group_by(journal, field, citation_style) %>% summarize(mean_citation_rate = round(mean(times_cited), 2), num_articles = length(times_cited)) # summarize citation count by journal, field, and citation style times_cited_overall_n2 <- format(sum(cited_df2$num_articles), big.mark = ",", trim=TRUE) # calculate overall number of articles times_cited_field_n2 <- format((cited_df2 %>% group_by(field) %>% summarize(num_articles = sum(num_articles)))$num_articles, big.mark = ",", trim=TRUE) # calculate number of articles for each field times_cited_journal2 <- cited_df2 %>% group_by(journal, field) %>% summarize(num_articles = sum(num_articles)) # calculate number of articles per journal times_cited_journal2$n <- format(times_cited_journal2$num_articles, big.mark = ",", trim=TRUE) # format number of articles journal_names2 <- unique(citation_data2$journal) # create vector of journals ### ## Calculate effect of letter on citation rate per field ### # Calculate citation rate per field citation_rate_field2 <- citation_data2 %>% group_by(field, author, author_num) %>% summarize(mean_prob_cited = mean(prob_cited), mean_citation_rate = mean(times_cited)) # sum times cited by letter field_df2 <- field_df_nooutliers2 <- data.frame(matrix(rep(NA, 6), ncol = 6)) # initiate data frame for author data # Calculate correlations, slopes, intercepts, and Bayes factors for citation style calculateCorrelationPerGroup(citation_rate_field2, field_names2) field_df2 <- group_citation_rate_df # save data field_citation_rate_stats2 <- group_citation_rate_stats # save stats field_prob_cited_stats2 <- group_prob_cited_stats # save stats field_df2$group <- factor(field_df2$group, levels = c("Psychology", "Geoscience")) # reorder fields field_citation_rate_stats2$group <- factor(field_citation_rate_stats2$group, levels = c("Psychology", "Geoscience")) # reorder fields field_prob_cited_stats2$group <- factor(field_prob_cited_stats2$group, levels = c("Psychology", "Geoscience")) # reorder fields # Calculate Bayesian linear regression comparing field * letter interaction to without interaction field_null_bf2 <- lmBF(percent_citation_rate ~ author_num + group, data = field_df2, progress = FALSE) # calculate Bayes factor for null model with just main effects field_bf2 <- lmBF(percent_citation_rate ~ author_num * group, data = field_df2, progress = FALSE) # calculate Bayes factor for null model with main effects and interaction field_bf2 / field_null_bf2 # calculate Bayes factor comparing interaction to null model # Plot mean citation rate per letter and field field_tau_label2 <- paste("tau ==", round(field_citation_rate_stats2$tau, 2)) times_cited_field_ggplot2 <- ggplot(field_df2, aes(x = author, y = percent_citation_rate)) + facet_grid(~group) + # for each group geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), field_citation_rate_stats2, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels annotate("text", 13, 6, label = field_tau_label2, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 5.7, label = paste("BF =", round(field_citation_rate_stats2$bf, 2)), size = 18) + # add Bayes factor theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=40), strip.text.x = element_text(size = 60, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/fields2.png"), width = 1400, height = 800) # open PNG device plot(times_cited_field_ggplot2) # plot figure dev.off() # close device ### ## Calculate effect of letter on citation rate per journal ### # Calculate citation rate per journal citation_rate_journal2 <- citation_data2 %>% group_by(journal, author, author_num) %>% summarize(mean_prob_cited = mean(prob_cited), mean_citation_rate = mean(times_cited)) # sum times cited by letter and journal # Calculate correlations, slopes, intercepts, and Bayes factors for citation style calculateCorrelationPerGroup(citation_rate_journal2, journal_names2) journal_df2 <- group_citation_rate_df # save data journal_citation_rate_stats2 <- group_citation_rate_stats # save stats journal_df2 <- merge(journal_df2, times_cited_journal2[, c("journal", "field")], by.x = "group", by.y = "journal") # add field column journal_citation_rate_stats2 <- merge(journal_citation_rate_stats2, times_cited_journal2[, c("journal", "field")], by.x = "group", by.y = "journal") # add field column ## Psychology data # Prepare data psychology_journal_labels2 <- c("American Journal \nof Orthopsychiatry", "Asian American Journal \nof Psychology", "Attention Perception \n& Psychophysics", "Behavioral Neuroscience", "Behavior Research Methods", "Canadian Journal of \nExperimental Psychology", "Canadian Psychology-\nPsychologie Canadienne", "Cognitive Affective & \nBehavioral Neuroscience", "Contemporary Psychology-\nAPA Review of Books", "Cultural Diversity & \nEthnic Minority Psychology", "Developmental Psychology", "Dreaming", "Emotion", "Experimental and Clinical \nPsychopharmacology", "Group Dynamics-Theory \nResearch and Practice", "Health Psychology", "History of Psychology", "Journal of \nAbnormal Psychology", "Journal of Applied Psychology", "Journal of Consulting and \nClinical Psychology", "Journal of \nCounseling Psychology", "Journal of Diversity \nin Higher Education", "Journal of \nEducational Psychology", "Journal of Experimental \nPsychology-Applied", "Journal of Experimental \nPsychology-General", "Journal of Experimental \nPsychology-HPP", "Journal of Experimental \nPsychology-LMC", "Journal of Family Psychology", "Journal of Occupational \nHealth Psychology", "Journal of Personality \nand Social Psychology", "Law and Human Behavior", "Memory & Cognition", "Military Psychology", "Neuropsychology", "Personality Disorders-Theory \nResearch and Treatment", "Professional Psychology-\nResearch and Practice", "Psychiatric Rehabilitation \nJournal", "Psychoanalytic Psychology", "Psychological Assessment", "Psychological Methods", "Psychological Services", "Psychological Trauma-Theory \nResearch Practice and Policy", "Psychology and Aging", "Psychology of \nAddictive Behaviors", "Psychology of Aesthetics \nCreativity and the Arts", "Psychology of Men \n& Masculinity", "Psychology of Religion \nand Spirituality", "Psychology of Violence", "Psychology Public Policy \nand Law", "Psychotherapy", "Rehabilitation Psychology", "Review of General Psychology", "School Psychology Quarterly", "Training and Education in \nProfessional Psychology") # Filter psychology journals psychology_journal_df2 <- filter(journal_df2, field == "Psychology") # filter out psychology journals psychology_journal_df2$journal <- factor(psychology_journal_df2$group, labels = psychology_journal_labels2) # relabel journals psychology_citation_rate_stats2 <- filter(journal_citation_rate_stats2, field == "Psychology") # filter out psychology journals for statistics psychology_citation_rate_stats2$journal <- psychology_journal_labels2 # relabel journals psychology_journal_stats2 <- filter(times_cited_journal2, field == "Psychology") # filter out psychology journals # Plot mean citation rate per psychology journal psychology_tau_label2 <- paste("tau ==", round(psychology_citation_rate_stats2$tau, 2)) psychology_ggplot2 <- ggplot(psychology_journal_df2, aes(x = author, y = percent_citation_rate)) + facet_wrap(~journal) + # for each journal geom_point(size = 5, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), psychology_citation_rate_stats2, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels ylim(0, 15) + annotate("text", 13, 14, label = psychology_tau_label2, size = 12, parse = TRUE) + # add tau value annotate("text", 13, 12, label = paste("BF =", round(psychology_citation_rate_stats2$bf, 1)), size = 12) + # add Bayes factor annotate("text", 13, 10, label = paste("N =", psychology_journal_stats2$n), size = 12) + # add N theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text.x=element_text(size=10), axis.text.y=element_text(size=20), strip.text.x = element_text(size = 15, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/psychology_journals2.png"), width = 1800, height = 2200) # open PNG device plot(psychology_ggplot2) # plot figure dev.off() # close device ## Geoscience data # Prepare data geoscience_journal_labels2 <- c("Atmospheric Science Letters", "Bulletin of the Seismological \nSociety of America", "Earth Interactions", "Environmental & \nEngineering Geoscience", "Geochemistry-Exploration \nEnvironment Analysis", "Geological Society of \nAmerica Bulletin", "Geology", "Geosphere", "International Journal \nof Climatology", "Italian Journal of Geosciences", "Journal of Applied \nMeteorology and Climatology", "Journal of Atmospheric \nand Oceanic Technology", "Journal of Climate", "Journal of Hydrometeorology", "Journal of \nMicropalaeontology", "Journal of \nPhysical Oceanography", "Journal of the \nAtmospheric Sciences", "Journal of the \nGeological Society", "Lithosphere", "Meteorological Applications", "Monthly Weather Review", "Petroleum Geoscience", "Proceedings of the \nYorkshire Geological Society", "Quarterly Journal of Engineering \nGeology and Hydrogeology", "Quarterly Journal of the \nRoyal Meteorological Society", "Scottish Journal of Geology", "Seismological Research Letters", "South African Journal of Geology", "Vadose Zone Journal", "Weather and Forecasting", "Weather Climate and Society") # Filter geoscience journals geoscience_journal_df2 <- filter(journal_df2, field == "Geoscience") # filter out geoscience journals geoscience_journal_df2$journal <- factor(geoscience_journal_df2$group, labels = geoscience_journal_labels2) # relabel journals geoscience_citation_rate_stats2 <- filter(journal_citation_rate_stats2, field == "Geoscience") # filter out geoscience journals for statistics geoscience_citation_rate_stats2$journal <- geoscience_journal_labels2 # relabel journals geoscience_journal_stats2 <- filter(times_cited_journal2, field == "Geoscience") # filter out psychology journals # Plot mean citation rate per geoscience journal geoscience_tau_label2 <- paste("tau ==", round(geoscience_citation_rate_stats2$tau, 2)) geoscience_ggplot2 <- ggplot(geoscience_journal_df2, aes(x = author, y = percent_citation_rate)) + facet_wrap(~journal) + # for each journal geom_point(size = 5, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), geoscience_citation_rate_stats2, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean citation rate") + # create axis labels ylim(0, 15) + annotate("text", 13, 14, label = geoscience_tau_label2, size = 12, parse = TRUE) + # add tau value annotate("text", 13, 12, label = paste("BF =", round(geoscience_citation_rate_stats2$bf, 1)), size = 12) + # add Bayes factor annotate("text", 13, 10, label = paste("N =", geoscience_journal_stats2$n), size = 12) + # add N theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=18), strip.text.x = element_text(size = 18, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/geoscience_journals2.png"), width = 1800, height = 2200) # open PNG device plot(geoscience_ggplot2) # plot figure dev.off() # close device # Follow-up analysis ----------------------------------------------- ### # Probability of citation ### # Plot mean probability of citation per letter and citation style for data set 1 prob_style_tau_label1 <- paste("tau ==", round(citation_style_prob_cited_stats1$tau, 2)) prob_cited_citation_style_ggplot1 <- ggplot(citation_style_df1, aes(x = author, y = mean_prob_cited)) + facet_grid(~group) + # for each group geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), citation_style_prob_cited_stats1, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean probability of citation") + # create axis labels annotate("text", 13, 1, label = prob_style_tau_label1, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 0.985, label = paste("BF =", round(citation_style_prob_cited_stats1$bf, 2)), size = 18) + # add Bayes factor theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=40), strip.text.x = element_text(size = 60, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/citation_styles_prob1.png"), width = 2000, height = 1000) # open PNG device plot(prob_cited_citation_style_ggplot1) # plot figure dev.off() # close device # Plot mean probability of citation per letter and field for data set 1 prob_field_tau_label1 <- paste("tau ==", round(field_prob_cited_stats1$tau, 2)) prob_cited_field_ggplot1 <- ggplot(field_df1, aes(x = author, y = mean_prob_cited)) + facet_grid(~group) + # for each group geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), field_prob_cited_stats1, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean probability of citation") + # create axis labels annotate("text", 13, 0.99, label = prob_field_tau_label1, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 0.976, label = paste("BF =", round(field_prob_cited_stats1$bf, 2)), size = 18) + # add Bayes factor theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=40), strip.text.x = element_text(size = 60, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/fields_prob1.png"), width = 2000, height = 1000) # open PNG device plot(prob_cited_field_ggplot1) # plot figure dev.off() # close device # Plot mean probability of citation per letter and field for data set 2 prob_field_tau_label2 <- paste("tau ==", round(field_prob_cited_stats2$tau, 2)) prob_cited_field_ggplot2 <- ggplot(field_df2, aes(x = author, y = mean_prob_cited)) + facet_grid(~group) + # for each group geom_point(size = 7, color = "grey50") + # set point size geom_abline(aes(slope = beta, intercept = intercept), field_prob_cited_stats2, color = "black", size = 2) + # create linear regression line labs(x = "First letter of author's last name",y = "Mean probability of citation") + # create axis labels annotate("text", 13, 0.99, label = prob_field_tau_label2, size = 18, parse = TRUE) + # add tau value annotate("text", 13, 0.971, label = paste("BF =", round(field_prob_cited_stats2$bf, 2)), size = 18) + # add Bayes factor theme_bw() + # use theme theme(axis.title=element_text(size=70), axis.text=element_text(size=40), strip.text.x = element_text(size = 60, margin = margin(0.5, 0, 0.5, 0, "cm")), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/fields_prob2.png"), width = 2000, height = 1000) # open PNG device plot(prob_cited_field_ggplot2) # plot figure dev.off() # close device # Article-level analyses ----------------------------------------------- #### ## Prepare data #### citation_data_all <- rbind(citation_data1, citation_data2) # combine data sets citation_data_all_nozero <- subset(citation_data_all, times_cited > 0) # remove data with no citations citation_data_all_nozero$field <- factor(citation_data_all_nozero$field, levels = c("Psychology", "Biology", "Geoscience")) #### # Conduct LMM with log-transformed citation counts #### ## Full model includes interactions between letter and citation style and letter and field (but not citation style and field) ## Investigate citation style and field interactions with letter citation_data_all_nozero$log_times_cited <- log(citation_data_all_nozero$times_cited) # log transform citation counts letter_field_year_journal_lmer <- lmer(log_times_cited ~ author_num * field + (1 | year_num) + (1 | journal), data = citation_data_all_nozero, REML = FALSE) # fit letter and field interaction letter_field_nointer_year_journal_lmer <- lmer(log_times_cited ~ author_num + field + (1 | year_num) + (1 | journal), data = citation_data_all_nozero, REML = FALSE) # fit letter and field main effects (no interaction) bic_bf10(BIC(letter_field_nointer_year_journal_lmer), BIC(letter_field_year_journal_lmer)) # Calculate model comparison Bayes factors for field by letter interaction ## Assess whether random slopes should be included letter_field_year_journal_rs.journal_lmer <- lmer(log_times_cited ~ author_num * field + (author_num | journal) + (1 | year_num), data = citation_data_all_nozero, REML = FALSE) # fit random slopes for journal anova(letter_field_year_journal_lmer, letter_field_year_journal_rs.journal_lmer) # conduct likelihood ratio test comparing this model with model without random slopes bic_bf10(BIC(letter_field_year_journal_lmer), BIC(letter_field_year_journal_rs.journal_lmer)) # compare Bayes factors for this model and model without random slopes # Likelihood ratio tests suggests that including random slopes is not warranted at p = 0.112 (BF = 0.000067) letter_field_year_journal_rs.year_lmer <- lmer(log_times_cited ~ author_num * field + (1 | journal) + (author_num | year_num), data = citation_data_all_nozero, REML = FALSE) # fit random slopes for year anova(letter_field_year_journal_lmer, letter_field_year_journal_rs.year_lmer) # conduct likelihood ratio test comparing this model with model without random slopes bic_bf10(BIC(letter_field_year_journal_lmer), BIC(letter_field_year_journal_rs.year_lmer)) # compare Bayes factors for this model and model without random slopes # Likelihood ratio tests suggests that including random slopes is not warranted at p = 0.972 (BF = 0.0000077) #### # Iterate LMMs across quantiles #### ## Prepare data num_quantiles <- 10 # assign number of quantiles quantile_index <- 1:num_quantiles # create index of quantiles field_quantiles_bf <- quant_min <- quant_max <- NA # initiate vectors of Bayes factors and beta coefficients for citation style and field for each quantile ## Iterate LMMs for each quantile foreach(quant = quantile_index) %do% { # for each quantile # Extract top quantile data for each field psych_top <- extractTopQuantile(subset(citation_data_all_nozero, field == "Psychology")) bio_top <- extractTopQuantile(subset(citation_data_all_nozero, field == "Biology")) geo_top <- extractTopQuantile(subset(citation_data_all_nozero, field == "Geoscience")) all_top <- rbind(psych_top, bio_top) all_top <- rbind(all_top, geo_top) all_top$log_times_cited <- log(all_top$times_cited) # Conduct LMM for full model and model without letter by citation style interaction letter_field_journal_year_iter_lmer <- lmer(log_times_cited ~ author_num * field + (1 | journal) + (1 | year_num), data = all_top, REML = FALSE) letter_field_nointer_journal_year_iter_lmer <- lmer(log_times_cited ~ author_num + field + (1 | journal) + (1 | year_num), data = all_top, REML = FALSE) # Calculate Bayes factor for full/no interaction comparison field_quantiles_bf[quant] <- bic_bf10(BIC(letter_field_nointer_journal_year_iter_lmer), BIC(letter_field_journal_year_iter_lmer)) # Extract minimum citation count per letter letter_mins <- all_top %>% group_by(author) %>% summarise(min_times_cited = min(times_cited)) quant_min[quant] <- min(letter_mins$min_times_cited) # find smallest minimum citation count quant_max[quant] <- max(letter_mins$min_times_cited) # find largest minimum citation count } # Create dataframe of quantile data quantile_data <- tibble(quantile_values = seq(from = 0, to = 90, by = 10), # create column of quantiles field_bf = field_quantiles_bf, # create column of Bayes factors min_times_cited = quant_min, # create column of minimum times cited max_times_cited = quant_max) %>% # create column of maximum times cited mutate(ranges = paste("[", min_times_cited, ",", max_times_cited, "]", sep = ""), xaxis = c(0.335, 0.405, 0.475, 0.544, 0.615, 0.683, 0.753, 0.823, 0.892, 0.962) ) ## Plot BFs as a function of percentile y_axis_breaks <- c(1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6) # create vector of y-axis breaks # Plot figure field_bf_plot <- ggplot(quantile_data, aes(x = quantile_values, y = field_bf)) + geom_rect(xmin = -Inf, xmax = Inf, ymin = log10(1/3), ymax = log10(3), fill = "grey80") + # plot shaded rectangle for area with no evidence geom_point(size = 9) + # plot points scale_y_log10(breaks = y_axis_breaks, labels = comma(y_axis_breaks)) + # create y-axis breaks scale_x_continuous(breaks = seq(0, 90, 10)) + # create x-axis breaks geom_hline(yintercept = 3) + # plot line for BF = 3 geom_hline(yintercept = 10) + # plot line for BF = 10 geom_hline(yintercept = 30) + # plot line for BF = 30 geom_hline(yintercept = 100) + # plot line for BF = 100 geom_hline(yintercept = 1/3, linetype = 2) + # plot line for BF = 1/3 geom_hline(yintercept = 1/10, linetype = 2) + # plot line for BF = 1/10 geom_hline(yintercept = 1/30, linetype = 2) + # plot line for BF = 1/30 geom_hline(yintercept = 1/100, linetype = 2) + # plot line for BF = 1/100 # geom_text(aes(label = ranges, x = quantile_values, y = 0.000003), size = 8.5) + labs(x = "Percentile", y = "Bayes factor") + # label axes theme_classic() + # use theme theme(axis.title.x = element_text(size = 70, margin = unit(c(1, 0, 0, 0), "lines")), axis.title.y = element_text(size = 70), axis.text=element_text(size = 50), plot.margin = grid::unit(c(0.5, 0.5, 0.5, 0.5), "lines")) # set font sizes and buffers png(filename = paste("figures/field_bf.png"), width = 1200, height = 1000) # open PNG device plot(field_bf_plot) # plot figure for(i in 1:nrow(quantile_data)) { grid.text(quantile_data$ranges[i], x = quantile_data$xaxis[i], y = unit(3.2, "lines"), gp = gpar(fontsize = 25, col = "grey30")) } dev.off() # close device