In [2]:
# NOTE(review): the star import hides where names come from. Later cells use
# `plt`, `pd`, `np`, `sns`, `smf`, `display` and `get_dataframe` without importing
# them, so they are presumably re-exported by `utils` — TODO confirm and import explicitly.
from utils import *
# Load the dataframe analysed below (one row per participant, judging by the
# "number of participants" printout in Section 5).
# NOTE(review): the meaning of the positional `True` flag is not visible here.
df = get_dataframe(True)

In [None]:
# Number of rows per condition identifier (`model` column).
df['model'].value_counts()

Explanation of the different columns in df:

model: condition identifier

model_name: model identifier

task_id: task set identifier

interface: interface identifier (autocomplete, chat, or nomodel)

aihelpful: how helpful the AI was (rating int)

mean_task_duration: mean time taken to complete a task

n_tasks_completed: number of tasks completed

sugg_accept_rate: average suggestion acceptance rate

n_sugg_requested: number of suggestions requested

sugg_accept_rate_requested: suggestion acceptance rate for requested suggestions

sugg_accept_rate_non_requested: suggestion acceptance rate for non-requested suggestions

n_assistant_response: number of chatbot messages

n_copy_code_button: times the copy code button was clicked

n_copy_from_chat: number of times code was copied from the AI chat

avg_copy_per_response: average number of times code was copied per chatbot message

prog_experience: programming experience from google form

python_experience: python experience from google form

ai_experience: AI experience from google form

task_data: for every task attempted, the final code that was submitted, whether the task was 
completed or skipped, and the time taken on the task

code_history: for all tasks attempted, the code history of the task

In [4]:
# Use a 20pt base font for every figure drawn after this cell.
plt.rcParams['font.size'] = 20

# Section 4

Visualizing results from two papers: *Code Llama: Open Foundation Models for Code* and *Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation*.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams.update({'font.size': 20})

plt.figure(figsize=(13, 6))

# One RGBA colour per model, in the same order as `pass_at_1_by_model` below.
colors = [
    (0.2, 0.4, 0.2, 0.8),
    (0.1, 0.4, 0.2, 1),
    (0.2, 0.4, 0.7, 0.8),
    (0.2, 0.4, 0.7, 1),
    (0.6, 0.2, 0.6, 0.8),
    (0.8, 0.2, 0.6, 1),
    (0.8, 0.8, 0.2, 1),
]

benchmarks = ("HumanEval", "MBPP")
# Hard-coded Pass@1 scores per model, as (HumanEval, MBPP).
pass_at_1_by_model = {
    'CodeLlama7b': (33.5, 41.4),
    'CodeLlama7b (chat)': (34.8, 44.4),
    'CodeLlama34b': (48.8, 55.0),
    'CodeLlama34b (chat)': (41.5, 57.0),
    'GPT-3.5 ': (80.34, 81.03),
    'GPT-3.5 (chat)': (77.0, 82.91),
    'GPT-4 (chat)': (90.2, 85.7),
}

x = np.arange(len(benchmarks))  # position of the first bar of each benchmark group
width = 0.15  # the width of the bars

for bar_index, (model_label, scores) in enumerate(pass_at_1_by_model.items()):
    # Chat variants are hatched so they can be told apart from the base model.
    hatch_pattern = '/' if 'chat' in model_label else ''
    plt.bar(x + width * bar_index, scores, width, label=model_label,
            color=colors[bar_index], hatch=hatch_pattern)

plt.ylabel('Pass@1')
# Bars sit at offsets 0 .. (n-1)*width, so the group midpoint is (n-1)/2 * width.
# With seven models that is 3*width; the previous hard-coded 2.5*width was off-centre.
plt.xticks(x + width * (len(pass_at_1_by_model) - 1) / 2, benchmarks)
plt.ylim(20, 91)

# Legend goes below the axes so it does not cover the bars.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
          fancybox=True, shadow=True, ncol=3)
plt.savefig('figures/benchmark.pdf', bbox_inches='tight')


# Section 5 stats

In [None]:
# Headline counts for Section 5: participants, conditions, tasks, suggestions and chat usage.
print(f'number of participants {len(df)}')

print("Percentage of participants by condition:", df['model'].value_counts(normalize=True) * 100)
print("Count of participants by condition:", df['model'].value_counts())
print('Total number of tasks completed', sum(df['n_tasks_completed']))
print("Summary statistics for the number of tasks completed:",df['n_tasks_completed'].describe())

print('total number of suggestions shown',  sum(df['n_sugg_shown']))
print('total number of suggestions accepted', sum(df['n_sugg_accepted']) )
# NOTE(review): despite the label, this value is a fraction in [0, 1], not a percentage.
print('percentage of suggestions accepted', sum(df['n_sugg_accepted']) / sum(df['n_sugg_shown']))


print('total number of assistant responses', sum(df['n_assistant_response']))
print('total number of copied messages from responses', sum(df['n_copy_from_chat']))
copy_rate = df['avg_copy_per_response']
# 1.0 if the participant copied at least once, 0.0 if never, NaN where no copy rate
# exists (presumably participants without chat data — TODO confirm).
# The previous `np.where(rate > 0, 1, 0)` turned those NaNs into 0, so `np.nanmean`
# had nothing left to skip and the average was diluted by rows with no copy rate.
df['copied_per_response'] = np.where(copy_rate.isna(), np.nan, (copy_rate > 0).astype(float))
print('average copy rate from chat response', np.nanmean(df['copied_per_response']))


# Section 5.1


### Task duration and tasks completed bar plot

In [7]:
# Row order used when the per-model summaries are drawn as bars.
condition_order = ['No LLM', 'CodeLlama7b', 'CodeLlama7b (chat)', 'CodeLlama34b',
                   'CodeLlama34b (chat)', 'GPT-3.5', 'GPT-3.5 (chat)', 'GPT-4o (chat)']

participants_by_model = df.groupby('model_name')

# Per-model mean, standard error and raw values of the mean task duration.
# mean() and sem() skip NaN; the raw lists under 'values' still contain them.
duration_groups = participants_by_model['mean_task_duration']
task_completion_time_df = pd.DataFrame({
    'mean': duration_groups.mean(),
    'se': duration_groups.sem(),
    'values': duration_groups.apply(list),
}).reindex(condition_order)

# Same three summaries for the number of tasks completed.
completed_groups = participants_by_model['n_tasks_completed']
tasks_completed_df = pd.DataFrame({
    'mean': completed_groups.mean(),
    'se': completed_groups.sem(),
    'values': completed_groups.apply(list),
}).reindex(condition_order)

In [None]:
import scipy.stats as stats
plt.rcParams.update({'font.size': 20})

plt.figure(figsize=(7,7))

# One RGBA colour per row of the summary tables (No LLM first, GPT-4o last).
# The tasks-completed bar-plot cells further down reuse this `colors` list.
colors = [
    (0, 0, 0, 0.8),
    (0.2, 0.4, 0.2, 0.8),
    (0.1, 0.4, 0.2, 1),
    (0.2, 0.4, 0.7, 0.8),
    (0.2, 0.4, 0.7, 1),
    (0.6, 0.2, 0.6, 0.8),
    (0.8, 0.2, 0.6, 1),
    (0.8, 0.8, 0.2, 1),  # yellow
]

x = 0  # position of the first bar
width = 0.17  # horizontal distance between bars

# Baseline (No LLM) statistics, looked up once instead of on every iteration.
baseline_mean = task_completion_time_df.loc['No LLM', 'mean']
baseline_durations = task_completion_time_df.loc['No LLM', 'values']

for bar_index, (name, row) in enumerate(task_completion_time_df.iterrows()):
    bar_x = x + width * bar_index
    measurement = row['mean']
    # Chat variants are hatched so they can be told apart from the base model.
    hatch_pattern = '/' if 'chat' in name else ''
    plt.bar(bar_x, measurement, width - 0.02, label=name, color=colors[bar_index], hatch=hatch_pattern)
    # Standard-error whisker.
    plt.errorbar(bar_x, measurement, row['se'], fmt='none', ecolor='black', capsize=5, capthick=2)

    if name == "No LLM":
        continue  # the baseline bar carries no annotation

    # Percentage change in mean task duration relative to No LLM.
    improvement = (measurement - baseline_mean) / baseline_mean * 100
    # Significance is tested on the task durations this figure shows. The previous
    # version compared the tasks-completed values, so the star did not belong to
    # the plotted quantity. NaNs are omitted, matching how the mean and SE are computed.
    _, p = stats.ttest_ind(baseline_durations, row['values'], nan_policy='omit')
    sign = "+" if improvement >= 0 else "-"
    star = "*" if p < 0.05 else ""
    plt.text(bar_x, measurement + 45, f"{sign}{abs(improvement):.0f}%{star}", ha='center', va='bottom', fontsize=14)

plt.ylabel('Task Duration (s)')
plt.xlabel("Condition")
plt.ylim(100, 520)
plt.xticks([0], [''])

plt.savefig('figures/task_duration_barplot_no_legend.pdf', bbox_inches='tight')



In [None]:

plt.rcParams.update({'font.size': 20})

fig = plt.figure(figsize=(7, 7))
ax = fig.gca()

bar_pitch = 0.17  # horizontal distance between consecutive bars
# Mean tasks completed in the No LLM row; every other bar is annotated relative to it.
no_llm_mean = tasks_completed_df.loc['No LLM']['mean']

# `colors` is defined in the task-duration bar-plot cell and indexed in row order.
for position, (condition, summary) in enumerate(tasks_completed_df.iterrows()):
    bar_x = 0 + bar_pitch * position
    bar_height = summary['mean']
    ax.bar(bar_x, bar_height, bar_pitch - 0.02, label=condition,
           color=colors[position], hatch='/' if 'chat' in condition else '')
    # Standard-error whisker.
    ax.errorbar(bar_x, bar_height, summary['se'], fmt='none', ecolor='black', capsize=5, capthick=2)
    if condition == "No LLM":
        continue
    # Signed percentage change relative to the No LLM bar (no significance test here).
    change_pct = (bar_height - no_llm_mean) / no_llm_mean * 100
    prefix = "+" if change_pct >= 0 else "-"
    ax.text(bar_x, bar_height + 0.5, f"{prefix}{abs(change_pct):.0f}%", ha='center', va='bottom', fontsize=14)

ax.set_ylabel('Tasks Completed')
ax.set_xlabel("Condition")
ax.set_ylim(0, 5.2)
# A single unlabeled x tick: the bars are identified by colour and hatch only.
ax.set_xticks([0])
ax.set_xticklabels([''])
ax.set_yticks([0, 1, 2, 3, 4, 5])
fig.savefig('figures/tasks_completed_barplot.pdf', bbox_inches='tight')


In [None]:

plt.rcParams.update({'font.size': 20})

legend_fig = plt.figure(figsize=(7, 7))
legend_ax = legend_fig.gca()

spacing = 0.17  # horizontal distance between consecutive bars
reference_mean = tasks_completed_df.loc['No LLM']['mean']

# Same bars as the legend-free figure; `colors` comes from the task-duration bar-plot cell.
for slot, (model_label, stats_row) in enumerate(tasks_completed_df.iterrows()):
    centre = 0 + spacing * slot
    mean_completed = stats_row['mean']
    hatch = '/' if 'chat' in model_label else ''  # chat variants are hatched
    legend_ax.bar(centre, mean_completed, spacing - 0.02, label=model_label,
                  color=colors[slot], hatch=hatch)
    legend_ax.errorbar(centre, mean_completed, stats_row['se'],
                       fmt='none', ecolor='black', capsize=5, capthick=2)
    if model_label != "No LLM":
        # Signed percentage change relative to the No LLM mean.
        delta_pct = (mean_completed - reference_mean) / reference_mean * 100
        marker = "+" if delta_pct >= 0 else "-"
        legend_ax.text(centre, mean_completed + 0.5, f"{marker}{abs(delta_pct):.0f}%",
                       ha='center', va='bottom', fontsize=14)

legend_ax.set_ylabel('Tasks Completed')
legend_ax.set_xlabel("Condition")
legend_ax.set_ylim(0, 5.2)
legend_ax.set_xticks([0])
legend_ax.set_xticklabels([''])
legend_ax.set_yticks([0, 1, 2, 3, 4, 5])
# Legend is placed outside the axes, to the right.
legend_ax.legend(loc='upper left', bbox_to_anchor=(1, 1), fancybox=True, shadow=True, ncol=1)
legend_fig.savefig('figures/tasks_completed_barplot_withlegend.pdf', bbox_inches='tight')


In [None]:
# Unconnected point estimates with standard-error bars, one row per `model_size` value.
duration_ax = sns.pointplot(data=df, x="zscore_mean_task_duration", y="model_size",
                            linestyles="", errorbar="se")
duration_ax.set_ylabel("")
duration_ax.set_xlabel(r'$\Delta$ in Avg Task Duration ($\downarrow$ better)')
duration_ax.set_xlim(-120, 100)
# NOTE(review): these labels are assigned by position, so they are only correct if
# seaborn orders `model_size` exactly this way — verify against the data.
duration_ax.set_yticks([0, 1, 2, 3, 4])
duration_ax.set_yticklabels(['GPT-3.5', 'CodeLlama-34b', 'CodeLlama-7b', 'GPT-4', 'No LLM'])
# An empty legend call removes any legend box from the figure.
duration_ax.legend([], [], frameon=False)
duration_ax.figure.savefig('figures/mean_task_duration.pdf', bbox_inches='tight')

In [None]:
def relabel(value):
    """Return the display label for a raw interface identifier.

    'autocomplete' maps to "Autocomplete", 'chat' maps to "Chat", and any
    other value maps to "No LLM".
    """
    for raw_name, display_name in (('autocomplete', "Autocomplete"), ('chat', "Chat")):
        if value == raw_name:
            return display_name
    return "No LLM"

# Human-readable interface label per row (not used by the plot in this cell).
df['interface_clean'] = df['interface'].apply(relabel)


# errorbar="se" makes this panel use standard-error bars like the task-duration
# point plot and the other point plots in this notebook. Without it seaborn falls
# back to its default confidence interval and the panels are not comparable.
sns.pointplot(x="zscore_n_tasks_completed", y="model_size", data=df, linestyles="", errorbar="se")
plt.ylabel("")
plt.xlabel(r'$\Delta$ in Num Tasks Completed ($\uparrow$ better)')
plt.xlim(-2,2)
# Hide the y ticks and their labels on this panel.
plt.tick_params(left = False , labelleft = False ) 
plt.savefig('figures/n_tasks_completed.pdf', bbox_inches='tight')

### Time spent based on coding task category

In [13]:
list_task_level_dfs = []

# NOTE(review): `ast` is no longer used in this cell (task_data is not parsed from a
# string any more); the import is kept in case other cells rely on it.
import ast

# Expand each row's `task_data` mapping into one dataframe row per task entry,
# tagged with that row's model, interface and model size.
for row in df.itertuples():
    task_data = row.task_data
    try:
        df_temp = pd.DataFrame(task_data.values()).assign(
            model=row.model,
            interface=row.interface,
            model_size=row.model_size,
            task_set=task_data.keys(),
        )
    except Exception as exc:
        # Best effort: skip rows whose task_data cannot be tabulated, but say which
        # row and why. The previous bare `except:` indexed task_data["-1"] inside
        # the handler, which could itself raise and hide the original error.
        suspect_entry = task_data.get("-1") if isinstance(task_data, dict) else None
        print(f"skipping row {row.Index}: {exc!r} (task_data['-1']={suspect_entry!r})")
        continue
    list_task_level_dfs.append(df_temp)


# Flag AI-assisted rows, then drop entries of 30 minutes or longer and the
# 'event_scheduler' task.
df_task_level = (
    pd.concat(list_task_level_dfs, ignore_index=True)
    .assign(has_ai=lambda x: x.model != "nomodel")
    .query("time_in_task < 30*60 and name != 'event_scheduler'")
    .reset_index()
)

In [14]:
# Tasks grouped by category; the dictionary order defines the display order.
tasks_by_category = {
    "tutorial": ["sum_product"],
    "data_manipulation": ["t_test", "table_transform_named", "table_transform_unnamed1", "table_transform_unnamed2"],
    "edit_code": ["tokenizer", "calculator", "login_authenticator", "retriever"],
    "puzzles": ["even_odd_count", "triple_sum_to_zero", "encode_message", "is_bored",
                "is_multiply_prime", "count_nums", "order_by_points"],
}

# Flat list of task names in display order.
task_display_order = [task for tasks in tasks_by_category.values() for task in tasks]
# Reverse lookup: task name -> category.
category_by_task = {task: category for category, tasks in tasks_by_category.items() for task in tasks}

df_task_level["ordered_name"] = pd.Categorical(df_task_level["name"], categories=task_display_order, ordered=True)
df_task_level["task_category"] = df_task_level["name"].map(category_by_task)

In [None]:
# Category value -> tick label, in the top-to-bottom order drawn on the y axis.
category_labels = {
    'puzzles': 'Algorithmic Problems',
    'data_manipulation': 'Data Manipulation',
    'edit_code': 'Edit/Augment Code',
}

# `order` pins the category order so the positional tick labels below always match
# the plotted rows. Previously the order depended on how seaborn inferred it from
# the data, so the hard-coded labels could end up on the wrong category.
sns.pointplot(y="task_category", x="time_in_task", hue="has_ai", dodge=0.25,
              data=df_task_level.query("task_category != 'tutorial'"),
              order=list(category_labels), linestyles="", errorbar="se")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Avg task duration (s)", fontsize=18)
plt.ylabel("")
plt.yticks(list(range(len(category_labels))), list(category_labels.values()))
# legend outside
plt.legend(title='LLM-assisted', loc="upper left", bbox_to_anchor=(1,1), fontsize=15)
plt.savefig('figures/task_level_duration_categories.pdf', bbox_inches='tight')


# Section 5.2

In [16]:
# `interface_plus_gpt4` copies `interface`, except that 'GPT-4o (chat)' rows are
# relabelled 'autocomplete' so they pass the autocomplete filter of the next plot.
is_gpt4o_chat = df['model_clean_name'] == 'GPT-4o (chat)'
df['interface_plus_gpt4'] = df['interface'].mask(is_gpt4o_chat, 'autocomplete')

In [None]:
# Start from the current axes with an empty, frameless legend.
accept_ax = plt.gca()
accept_ax.legend([], [], frameon=False)

# Rows tagged 'autocomplete' in `interface_plus_gpt4`, which also includes GPT-4o (chat).
# Presumably GPT-4o has no acceptance data and only adds an empty row — TODO confirm.
autocomplete_rows = df.query("interface_plus_gpt4 == 'autocomplete'")
sns.pointplot(data=autocomplete_rows, x="sugg_accept_rate", y="model_clean_name",
              linestyles="", errorbar="se", ax=accept_ax)

accept_ax.set_ylabel("")
# NOTE(review): the label says "%" but the axis range 0–0.2 suggests a fraction.
accept_ax.set_xlabel(r'Autocomplete: % Suggestion Accepted')
accept_ax.set_xlim(0, 0.2)
# Hide the y ticks and their labels on this panel.
accept_ax.tick_params(left=False, labelleft=False)
accept_ax.figure.savefig('figures/num_sugg_accepted.pdf', bbox_inches='tight')


In [None]:
# Average copies per chatbot response for chat participants, with standard-error bars.
chat_rows = df.query("interface == 'chat'")
copy_ax = sns.pointplot(data=chat_rows, x="avg_copy_per_response", y="model_clean_name",
                        linestyles="", errorbar="se")
copy_ax.set_ylabel("")
copy_ax.set_xlabel(r'Chat: % Code Copied')
copy_ax.set_xlim(0, 0.5)
# NOTE(review): labels are assigned by position; verify they match the order in
# which seaborn draws the `model_clean_name` values.
copy_ax.set_yticks([0, 1, 2, 3])
copy_ax.set_yticklabels(['GPT-3.5', 'CodeLlama-34b', 'CodeLlama-7b', 'GPT-4o'])
copy_ax.figure.savefig('figures/num_code_copied.pdf', bbox_inches='tight')


In [None]:
plt.figure(figsize=(14,14))
# Distribution of the task-duration delta per model.
# `linestyles=""` and `errorbar="se"` were copied from the point plots and are not
# violinplot options: depending on the seaborn version they are silently ignored or
# forwarded to matplotlib, which rejects `errorbar`. They are dropped here.
sns.violinplot(x="zscore_mean_task_duration", y="model_clean_name", data=df)
plt.ylabel("")
plt.xlabel(r'$\Delta$ in Avg Task Duration ($\downarrow$ better)')
# An empty legend call removes any legend box from the figure.
plt.legend([],[], frameon=False)


In [None]:
def relabel(value):
    """Return the display label for a raw interface identifier.

    'autocomplete' maps to "Autocomplete", 'chat' maps to "Chat", and any
    other value maps to "No AI".
    """
    if value == 'autocomplete':
        return "Autocomplete"
    return "Chat" if value == 'chat' else "No AI"

# Human-readable interface label per row (not used by the plot in this cell).
df['interface_clean'] = df['interface'].apply(relabel)

# Helpfulness rating per model for chat participants, with standard-error bars.
chat_only = df.query("interface == 'chat'")
helpful_ax = sns.pointplot(data=chat_only, x="aihelpful", y="model_clean_name",
                           linestyles="", errorbar="se")
helpful_ax.set_xlim(2, 10)
helpful_ax.set_xlabel(r'AI Helpfulness($\uparrow$)')
helpful_ax.set_ylabel("")

# NOTE(review): the plot has no hue, so this legend likely has no entries to show.
helpful_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))


In [None]:
def relabel(value):
    """Return the display label for a raw interface identifier.

    'autocomplete' maps to "Autocomplete", 'chat' maps to "Chat", and any
    other value maps to "No AI".
    """
    label = "No AI"
    if value == 'autocomplete':
        label = "Autocomplete"
    elif value == 'chat':
        label = "Chat"
    return label

# Human-readable interface label per row; used as the hue of the plot below.
df['interface_clean'] = df['interface'].apply(relabel)

# Helpfulness rating per model size, split by interface, excluding the no-model condition.
assisted_rows = df.query("interface != 'nomodel'")
interface_ax = sns.pointplot(data=assisted_rows, x="aihelpful", y="model_size",
                             hue="interface_clean", linestyles="", errorbar="se")
interface_ax.set_ylabel("")
interface_ax.set_xlim(0, 8)
interface_ax.set_xlabel(r'AI Helpfulness($\uparrow$)')
# Hide the y ticks and their labels on this panel.
interface_ax.tick_params(left=False, labelleft=False)
# Legend is placed outside the axes, to the right.
interface_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
interface_ax.figure.savefig('figures/ai_helpful.pdf', bbox_inches='tight')


# Stat tests

In [None]:
# OLS of tasks completed on model size, with 'nomodel' as the reference level.
tasks_completed_formula = "zscore_n_tasks_completed ~ C(model_size, Treatment(reference='nomodel'))"
tasks_completed_fit = smf.ols(tasks_completed_formula, data=df).fit()
display(tasks_completed_fit.summary())


In [None]:
# OLS of mean task duration on model size, with 'nomodel' as the reference level.
task_duration_formula = "zscore_mean_task_duration ~ C(model_size, Treatment(reference='nomodel'))"
task_duration_fit = smf.ols(task_duration_formula, data=df).fit()
display(task_duration_fit.summary())


In [None]:
# OLS of suggestion acceptance rate on model, autocomplete rows only,
# with 'autocomplete_llama34' as the reference level.
accept_rate_formula = "sugg_accept_rate ~ C(model, Treatment(reference='autocomplete_llama34'))"
accept_rate_fit = smf.ols(accept_rate_formula, data=df.query("interface=='autocomplete'")).fit()
display(accept_rate_fit.summary())


In [None]:
# OLS of copies per chatbot response on model, chat rows only,
# with 'chat_llama7' as the reference level.
copy_rate_formula = "avg_copy_per_response ~ C(model, Treatment(reference='chat_llama7'))"
copy_rate_fit = smf.ols(copy_rate_formula, data=df.query("interface=='chat'")).fit()
display(copy_rate_fit.summary())


In [None]:
# OLS of the helpfulness rating on interface, excluding the no-model condition,
# with 'autocomplete' as the reference level.
helpfulness_formula = "aihelpful ~ C(interface, Treatment(reference='autocomplete'))"
helpfulness_fit = smf.ols(helpfulness_formula, data=df.query("interface!='nomodel'")).fit()
display(helpfulness_fit.summary())


In [25]:
# Each entry: (formula, data, number of coefficients after the intercept whose
# p-values are collected). The regressions and counts match the summary cells above.
# NOTE(review): the counts are hard-coded. If a factor has more non-reference levels
# than listed (the task-duration point plot labels five model sizes, i.e. four
# contrasts), the remaining coefficients are left out of the correction — confirm
# this is intended.
regression_specs = [
    ("zscore_n_tasks_completed ~ C(model_size, Treatment(reference='nomodel'))", df, 3),
    # author note: first coefficient marked "significant"
    ("zscore_mean_task_duration ~ C(model_size, Treatment(reference='nomodel'))", df, 3),
    # author note: first coefficient marked "significant"
    ("sugg_accept_rate ~ C(model, Treatment(reference='autocomplete_llama34'))",
     df.query("interface=='autocomplete'"), 2),
    ("avg_copy_per_response ~ C(model, Treatment(reference='chat_llama7'))",
     df.query("interface=='chat'"), 2),
    # author note: marked "significant"
    ("aihelpful ~ C(interface, Treatment(reference='autocomplete'))",
     df.query("interface!='nomodel'"), 1),
]

pvals = []
for formula, data, n_terms in regression_specs:
    fit = smf.ols(formula, data=data).fit()
    # `fit.pvalues` is labelled by coefficient name, so positions must go through
    # `.iloc`; plain `fit.pvalues[1]` relies on deprecated positional fallback.
    # Position 0 is the intercept and is skipped.
    pvals.extend(fit.pvalues.iloc[1:1 + n_terms])


In [None]:
# Import the submodule explicitly: a bare `import statsmodels` does not guarantee
# that `statsmodels.stats.multitest` is loaded, so the attribute lookup below could
# fail on a fresh kernel.
import statsmodels.stats.multitest

# Benjamini-Hochberg FDR correction over all collected p-values.
statsmodels.stats.multitest.multipletests(pvals, method="fdr_bh")