By: Ben Campbell (benc@umd.edu)
Initially Published: 5/16/22
Last Updated: 5/16/22
Society is filled with rankings. There are rankings for high schools, towns, ice cream stores, and sports teams among countless other things. The people who make these rankings use some kind of formula to determine which things are ranked highly and which are not. Computer science also has rankings among professors and departments. The two main rankings of computer science departments are by USNews and CSRankings.org. The goal of both of these rankings is to tell the public how “good” each department is at computer science research. The keywords here are “computer science research”.
It is strange that prospective CS students use these rankings to decide where to go to school, given that only a small fraction of undergraduates ever participate in computer science research. It is not unreasonable to assume that a department strong in research is also good for undergraduate education, but that assumption leaves a lot out. This report aims to explore some of the factors more relevant to undergraduate CS students when deciding where to attend for the best undergraduate education and common undergraduate goals.
In this report, I collected the following datasets about the top 30 CS schools according to CSRankings.org:
Insights and commentary will be provided throughout the report. I hope this report motivates CS departments to enhance their undergraduate educational offerings and prospective CS students to weigh a few more factors before deciding where to attend. If you find any inaccuracies, please reach out via my email listed at the top.
This report was inspired by a similar report done in 2017 for this class and is meant to expand on some of the areas they explored: https://krixly.github.io/
UMD students have access to Testudo, which lists the schedule of classes along with class sizes, so I could scrape that page to get the CS class sizes at UMD. I assumed other public universities would have similar systems, but each would be different and would take time to learn. To get around this, I made several Reddit posts on other universities' subreddits asking about their equivalents of Testudo. Students on Reddit were very helpful and gave me tips on how to find what I was looking for, which saved me a lot of time.
I had time to scrape data for 5 universities, and I plan to expand this list eventually:
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/umd/umd_combined.csv
UMD students created a website called PlanetTerp, which has course reviews, instructor reviews, and grade-distribution data. I used their public API to get CS course enrollment information for the fall 2021 semester. Their API documentation can be found here: https://api.planetterp.com/#planetterp-api
The first step was getting a list of CS courses offered by UMD. I did this by calling the courses endpoint and searching for CS classes, which are prefixed with "CMSC" at UMD. I raised the default return limit to 300 because UMD has offered over 100 different CS courses over the years. I also extracted the professors who have taught each of these courses, since that helps determine section sizes for the fall 2021 semester.
I then used PlanetTerp's grades endpoint to get the fall 2021 course grades for a given course and professor. If a professor wasn't teaching that semester but has taught in the past, the API returns 0 for them. The endpoint returns all of that professor's sections, which for some courses means a batch of discussion-section sizes, so some cleaning is needed to recover the lecture size.
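To make the cleaning step concrete, here is a mocked-up section entry (all field names and values are hypothetical, mirroring the scraper's assumption below that the first four fields are metadata and the rest are grade counts):

```python
# one section entry from the grades endpoint, sketched from the scraper's
# assumptions: the first four fields are metadata, the rest are grade counts
section = {
    "course": "CMSC131",  # hypothetical example values
    "professor": "Jane Doe",
    "semester": "202108",
    "section": "0101",
    "A+": 5, "A": 12, "A-": 8,
    "B+": 6, "B": 4,
    "W": 2,
}

# drop the four metadata fields, then sum the remaining grade counts
# to get the number of students in the section
grade_counts = list(section.items())[4:]
section_size = sum(count for _, count in grade_counts)
print(section_size)  # 37
```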
I will not be running and showing the output of the scraper below since it takes a while to complete. The "umd.csv" generated will be shown in the following code cell.
%%script false --no-raise-error
import requests
from time import sleep
import pandas as pd

# get list of cs courses
params = {"department": "CMSC", "limit": 300}
cs_courses_data = requests.get("https://api.planetterp.com/v1/courses", params=params).json()

cs_course_nums = []
for course in cs_courses_data:
    cs_course_nums.append(course['department'] + str(course['course_number']))

# list of professors for each course, so we can get grades for each professor of the
# course during a specific semester to see how big lectures are
course_professors = []
for course in cs_courses_data:
    course_professors.append(course["professors"])

class_names = []
class_sizes = []
class_profs = []

# get grades for each course
semester = "202108"  # fall 2021
for i, course in enumerate(cs_course_nums):
    for prof in course_professors[i]:
        params = {"course": course, "semester": semester, "professor": prof}
        # for each course, returns a list of sections with corresponding grade data
        sections = requests.get("https://api.planetterp.com/v1/grades", params=params).json()
        print("{} {}".format(course, prof))
        for section in sections:
            # sometimes the api call doesn't return a dictionary, so skip it;
            # the cause seems to be inconsistent naming of professors, so some
            # spellings of the same professor return nothing
            if not isinstance(section, dict):
                print("skipping")
                continue
            total_students_section = 0
            # convert the section dictionary into a list, keeping only the items
            # that hold grade counts
            section_grades = list(section.items())[4:]
            for grade in section_grades:
                # add the number of students with this grade to the section total
                total_students_section += grade[1]
            # append section info to lists
            class_names.append(course)
            class_profs.append(prof)
            class_sizes.append(total_students_section)
        sleep(0.5)

# create dataframe from course counts
data_dict = {"course_name": class_names, "professor": class_profs, "count": class_sizes}
courses_df = pd.DataFrame.from_dict(data_dict)
courses_df.to_csv("umd.csv")
import pandas as pd
umd_data = pd.read_csv("class_sizes_data/umd.csv")
umd_data
Unnamed: 0 | course_name | professor | count | |
---|---|---|---|---|
0 | 0 | CMSC100 | Jessica Long | 33 |
1 | 1 | CMSC100 | Corie Brown | 17 |
2 | 2 | CMSC100 | Veronica Sanchez | 19 |
3 | 3 | CMSC106 | Jandelyn Plane | 51 |
4 | 4 | CMSC122 | Pedram Sadeghian | 83 |
... | ... | ... | ... | ... |
183 | 183 | CMSC829A | Erin Molloy | 21 |
184 | 184 | CMSC838G | Leonidas Lampropoulos | 11 |
185 | 185 | CMSC848B | Christopher Metzler | 21 |
186 | 186 | CMSC858G | Michael Gullans | 28 |
187 | 187 | CMSC858O | Xiaodi Wu | 12 |
188 rows × 4 columns
The list of discussion sections for each professor needs to be combined by professor and course to get the lecture size of that course for that professor.
classes_df = pd.read_csv("class_sizes_data/umd.csv")
# if row course_name and professor are the same, combine rows into one row
grouped = classes_df.groupby(["course_name", "professor"])
umd = grouped.agg({"count": "sum"})
umd
count | ||
---|---|---|
course_name | professor | |
CMSC100 | Corie Brown | 17 |
Jessica Long | 33 | |
Veronica Sanchez | 19 | |
CMSC106 | Jandelyn Plane | 51 |
CMSC122 | Jennifer Manly | 61 |
... | ... | ... |
CMSC829A | Erin Molloy | 21 |
CMSC838G | Leonidas Lampropoulos | 11 |
CMSC848B | Christopher Metzler | 21 |
CMSC858G | Michael Gullans | 28 |
CMSC858O | Xiaodi Wu | 12 |
82 rows × 1 columns
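The combine step above, grouping discussion sections by course and professor and summing their counts, can be sketched on a toy frame (course names, professor names, and counts here are hypothetical):

```python
import pandas as pd

# toy frame: one row per section, mirroring the layout of umd.csv
sections = pd.DataFrame({
    "course_name": ["CMSC131", "CMSC131", "CMSC131", "CMSC132"],
    "professor":   ["Jane Doe", "Jane Doe", "John Roe", "Jane Doe"],
    "count":       [30, 25, 40, 60],
})

# sections sharing a course and professor belong to the same lecture,
# so summing their counts recovers the lecture size
lectures = sections.groupby(["course_name", "professor"]).agg({"count": "sum"})
print(lectures)
# Jane Doe's two CMSC131 discussions combine into one 55-student lecture
```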
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/gatech/gatech.csv
Georgia Tech has a public class registry called Oscar, located at https://oscar.gatech.edu/. A user can search for all CS courses in a specific term, and Oscar generates a static HTML page that can be scraped for data. However, the query takes a long time to execute, so I saved the HTML page locally for my program to load later. I used the Python package Uncurl to generate the appropriate POST request. My code to get the HTML is below:
%%script false --no-raise-error
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
import json

res = requests.post(
    "https://oscar.gatech.edu/bprod/bwckschd.p_get_crse_unsec",
    data='term_in=202108&sel_subj=dummy&sel_day=dummy&sel_schd=dummy&sel_insm=dummy&sel_camp=dummy&sel_levl=dummy&sel_sess=dummy&sel_instr=dummy&sel_ptrm=dummy&sel_attr=dummy&sel_subj=CS&sel_crse=&sel_title=&sel_schd=%25&sel_from_cred=&sel_to_cred=&sel_camp=A&sel_ptrm=%25&sel_instr=%25&sel_attr=%25&begin_hh=0&begin_mi=0&begin_ap=a&end_hh=0&end_mi=0&end_ap=a',
    headers={
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "DNT": "1",
        "Origin": "https://oscar.gatech.edu",
        "Referer": "https://oscar.gatech.edu/bprod/bwckgens.p_proc_term_date",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"
    },
    cookies={
        "BIGipServer~BANNER~oscar.coda_443": "572230154.64288.0000"
    },
)

# save the response as html so the ~10 second query doesn't have to be
# repeated on every run
with open("gatech.html", "w") as f:
    f.write(res.text)
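Since the Oscar query takes around ten seconds, the save-to-disk step above is really a small caching pattern. Factored out, it might look like the sketch below (the function and its names are mine, not part of the scraper):

```python
import os

def cached_fetch(path, fetch):
    """Return the text cached at `path`; call `fetch()` only on a cache miss."""
    if os.path.exists(path):
        with open(path) as f:
            return f.read()
    # cache miss: run the slow request, e.g. the Oscar POST above
    text = fetch()
    with open(path, "w") as f:
        f.write(text)
    return text

# usage sketch:
# html = cached_fetch("gatech.html", lambda: requests.post(...).text)
```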
I now have the HTML page listing all of the CS classes for the fall 2021 term. But while the page lists the classes, you have to click through to each class to see how many seats it has. This means I need a program that collects all the links, visits each one, and extracts the relevant data from each class page. The code to do that is below, but it won't be run here because it takes a while.
%%script false --no-raise-error
# a link to a class with seat counts has the form
# bprod/bwckschd.p_disp_detail_sched?term_in=202108&crn_in=93519
# get all links on the page and filter for links like this
with open("gatech.html", "r") as f:
    soup = BeautifulSoup(f, 'html.parser')
links = soup.find_all('a')
class_links = []
class_titles = []
for link in links:
    if link.get('href') != None and "bprod/bwckschd.p_disp_detail_sched?term_in=202108" in link.get('href'):
        class_links.append(link.get('href'))
        class_titles.append(link.get_text())
class_data = {"title": class_titles, "link": class_links}
classes_df = pd.DataFrame.from_dict(class_data)

# extract the class name from the title, which has other junk in it
classes_df["class_name"] = ""
for index, row in classes_df.iterrows():
    class_name = re.findall(r"CS \d{4}", row['title'])[0]
    classes_df.at[index, "class_name"] = class_name

# when adding up class counts, make sure to exclude those which are:
# - recitations
# - studios
# - supervised/unsupervised laboratory
# - directed study
# - dissertation
# - thesis
# the class type and credit count can be found on each class page, so
# basically only keep lecture-type sections with credits > 0
# add columns to classes_df:
# - type
# - credits
# - seat_capacity
# - filled_seats
classes_df["type"] = ""
classes_df["credits"] = 0.0
classes_df["seat_capacity"] = 0
classes_df["filled_seats"] = 0

# list of links that had problems:
# will save them to a file after the program is over
problem_links = []

# go to all the links and fill in the new columns
for index, row in classes_df.iterrows():
    print(row["class_name"])
    link = "https://oscar.gatech.edu" + row["link"]
    res = requests.get(link)
    class_soup = BeautifulSoup(res.content, 'html.parser')
    # the page is mostly made up of tables
    tables = class_soup.find_all('table')
    # assuming that the table of interest will always be at index 3,
    # could be problems here if not always true...
    data_table = tables[3]
    match_class_type = re.search(r"(\w+)\* Schedule Type", data_table.text)
    if match_class_type:
        classes_df.at[index, "type"] = match_class_type.group(1)
    else:
        print("for {} was unable to find class type".format(link))
        problem_links.append(link)
    match_class_credits = re.search(r"(\d\.\d{3}) Credits", data_table.text)
    if match_class_credits:
        classes_df.at[index, "credits"] = match_class_credits.group(1)
    else:
        print("for {} was unable to find class credits".format(link))
        problem_links.append(link)
    match_class_seats = re.search(r"Seats\n(\d+)\n(\d+)\n", data_table.text)
    if match_class_seats:
        classes_df.at[index, "seat_capacity"] = match_class_seats.group(1)
        classes_df.at[index, "filled_seats"] = match_class_seats.group(2)
    else:
        print("for {} was unable to find class seats".format(link))
        problem_links.append(link)
    # write out at every iteration so no data is lost on a crash
    classes_df.to_csv("gatech.csv")
    sleep(2)
classes_df.to_csv("gatech.csv")

# write the problem-links list to json in case it has to be examined later
with open("problem_links.json", "w") as f:
    f.write(json.dumps(problem_links))
Each class's data is written to a dataframe. If a link doesn't work, I add it to a file that I can inspect after the program ends to do some detective work on what went wrong. Below is what the above program generates.
gatech = pd.read_csv("class_sizes_data/gatech.csv")
# to_csv saved the dataframe index as an "Unnamed" column, so delete it
gatech = gatech.loc[:, ~gatech.columns.str.contains('^Unnamed')]
gatech
title | link | class_name | type | credits | seat_capacity | filled_seats | |
---|---|---|---|---|---|---|---|
0 | Freshman Leap Seminar - 87695 - CS 1100 - A1 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 200 | 171 |
1 | Freshman Leap Seminar - 87696 - CS 1100 - A2 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 240 | 240 |
2 | Freshman Leap Seminar - 89255 - CS 1100 - A3 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 125 | 125 |
3 | Freshman Leap Seminar - 94424 - CS 1100 - A4 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 280 | 260 |
4 | Freshman Leap Seminar - 92598 - CS 1100 - B1 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 240 | 239 |
... | ... | ... | ... | ... | ... | ... | ... |
1441 | Doctoral Thesis - 93112 - CS 9000 - W13 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 9000 | Dissertation | 1.0 | 10 | 1 |
1442 | Doctoral Thesis - 81156 - CS 9000 - X01 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 9000 | Dissertation | 1.0 | 15 | 1 |
1443 | Doctoral Thesis - 95063 - CS 9000 - X03 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 9000 | Dissertation | 1.0 | 10 | 0 |
1444 | Doctoral Thesis - 94765 - CS 9000 - Y04 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 9000 | Dissertation | 1.0 | 10 | 1 |
1445 | Doctoral Thesis - 80218 - CS 9000 - Z01 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 9000 | Dissertation | 1.0 | 15 | 1 |
1446 rows × 7 columns
Some of the classes listed are clearly not lectures, so I filter down to classes whose type is "Lecture", with more than 0.0 credits and at least one enrolled student.
gatech = gatech[(gatech["type"] == "Lecture") & (gatech["credits"] > 0.0) & (gatech["filled_seats"] > 0)]
gatech = gatech.reset_index(drop=True)
gatech
title | link | class_name | type | credits | seat_capacity | filled_seats | |
---|---|---|---|---|---|---|---|
0 | Freshman Leap Seminar - 87695 - CS 1100 - A1 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 200 | 171 |
1 | Freshman Leap Seminar - 87696 - CS 1100 - A2 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 240 | 240 |
2 | Freshman Leap Seminar - 89255 - CS 1100 - A3 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 125 | 125 |
3 | Freshman Leap Seminar - 94424 - CS 1100 - A4 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 280 | 260 |
4 | Freshman Leap Seminar - 92598 - CS 1100 - B1 | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 1100 | Lecture | 1.0 | 240 | 239 |
... | ... | ... | ... | ... | ... | ... | ... |
169 | Special Topics - 93832 - CS 8803 - PHI | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 8803 | Lecture | 3.0 | 30 | 20 |
170 | Special Topics - 87122 - CS 8803 - PP | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 8803 | Lecture | 3.0 | 15 | 12 |
171 | Special Topics - 93244 - CS 8803 - SG | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 8803 | Lecture | 3.0 | 10 | 8 |
172 | Special Topics - 92061 - CS 8803 - SMC | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 8803 | Lecture | 3.0 | 40 | 16 |
173 | Special Topics - 92451 - CS 8803 - SMR | /bprod/bwckschd.p_disp_detail_sched?term_in=20... | CS 8803 | Lecture | 3.0 | 48 | 39 |
174 rows × 7 columns
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/uwash/uwash.csv
As with Georgia Tech, the University of Washington's class registry generates an HTML file that can be scraped for CS class sizes. This time the class sizes are listed directly under each course, but lecture and discussion sizes appear in the same area, so some logic is needed to extract only the lecture sizes. The HTML page is also generated in a way that is awkward to parse.
I got distinct classes by splitting on a class header element. Between class header elements are tables listing lecture and discussion sizes; these are the tables I parse to extract the lecture sizes for that class. To tell lectures and discussions apart, I check whether the "credits" field in a section's table is numeric: if it is, the table belongs to a lecture. I then use a regex to pull out the class size.
Below is the code which generates the csv file for Washington.
%%script false --no-raise-error
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

res = requests.get("https://www.washington.edu/students/timeschd/AUT2021/cse.html")
soup = BeautifulSoup(res.content, 'html.parser')

# class headings are table elements; class enrollment information sits in the
# tables between heading tables
# lectures have credits > 0 associated with them; discussions seem to have QZ as credits
class_heading_tables = soup.find_all("table", attrs={"bgcolor": "#ffcccc"})
all_tables = soup.find_all("table")

# in the all_tables list, get the index where the first class heading table starts
start_index = 0
while all_tables[start_index] != class_heading_tables[0]:
    start_index += 1

# dictionary with a class heading table as key and, as value, the list of
# tables between that heading and the next class heading
associated_tables = {}
current_class_table = 0
current_table = start_index + 1
current_class_associated_tables = []
while current_table < len(all_tables):
    # add the class heading table to the dictionary with an empty list
    # if it has no associated tables yet
    if len(current_class_associated_tables) == 0:
        associated_tables[class_heading_tables[current_class_table]] = []
    if current_class_table < len(class_heading_tables) - 1:
        if not all_tables[current_table] == class_heading_tables[current_class_table + 1]:
            current_class_associated_tables.append(all_tables[current_table])
        else:
            # store the associated tables under the current class heading
            associated_tables[class_heading_tables[current_class_table]] = current_class_associated_tables
            # clear out the associated tables because we're moving on to a new class
            current_class_associated_tables = []
            # advance to the next class heading
            current_class_table += 1
    else:  # on the last class heading table
        # just collect its tables; the list is added to the dict after the loop
        current_class_associated_tables.append(all_tables[current_table])
    current_table += 1
# for the last class heading
associated_tables[class_heading_tables[current_class_table]] = current_class_associated_tables

# group 1 - CSE
# group 2 - course number
# group 3 - course name (sometimes with a stray P at the end)
course_search = r"([A-Z]+)\s+(\d+)\s+((?:[A-Z]+\s*)+)"
# group 1 - credits, listed as 1, 1-1, or QZ
# group 2 - seats filled
# group 3 - seat capacity
section_search = r"\d+\s+[A-Z0-9]+\s+([A-Z]+|\d+-\d+|\d+).*\s+(\d+)/\s+(\d+)"

class_codes = []
class_names = []
class_filled = []
class_capacity = []
for course in associated_tables:
    section_tables = associated_tables[course]
    heading_matches = re.search(course_search, course.text)
    print("{} {} {}".format(heading_matches.group(1), heading_matches.group(2), heading_matches.group(3)))
    for section in section_tables:
        section_matches = re.search(section_search, section.text)
        section_credits = section_matches.group(1)
        # numeric credits indicate a lecture rather than a discussion
        credits_num_search = re.search(r"\d+-\d+|\d+", section_credits)
        if credits_num_search:
            print("{} / {}".format(int(section_matches.group(2)), int(section_matches.group(3))))
            class_codes.append(heading_matches.group(1) + " " + heading_matches.group(2))
            class_names.append(heading_matches.group(3))
            class_filled.append(int(section_matches.group(2)))
            class_capacity.append(int(section_matches.group(3)))

classes_df = pd.DataFrame({"class_code": class_codes, "class_name": class_names, "class_filled": class_filled, "class_capacity": class_capacity})
classes_df.to_csv("uwash.csv")
Here is what the generated csv file looks like in a dataframe:
uwash = pd.read_csv("class_sizes_data/uwash.csv")
# to_csv saved the dataframe index as an "Unnamed" column, so delete it
uwash = uwash.loc[:, ~uwash.columns.str.contains('^Unnamed')]
uwash
class_code | class_name | class_filled | class_capacity | |
---|---|---|---|---|
0 | CSE 142 | COMPUTER PRGRMNG I | 519 | 555 |
1 | CSE 142 | COMPUTER PRGRMNG I | 285 | 355 |
2 | CSE 143 | COMPUTER PRGRMNG II | 330 | 400 |
3 | CSE 143 | COMPUTER PRGRMNG II | 135 | 235 |
4 | CSE 143 | COMPUTER PRGRMNG II | 63 | 72 |
... | ... | ... | ... | ... |
167 | CSE 599 | SPECIAL TOPICS | 0 | 40 |
168 | CSE 600 | INDEPNDNT STDY | 307 | 95 |
169 | CSE 601 | INTERNSHIP | 78 | 70 |
170 | CSE 700 | MASTERS THESIS | 0 | 30 |
171 | CSE 800 | DOCTORAL DISSERTATN | 69 | 60 |
172 rows × 4 columns
Some classes have 0 students enrolled so I'll filter for classes which have a non-zero enrollment.
uwash = uwash[uwash["class_filled"] > 0]
uwash = uwash.reset_index(drop=True)
uwash
class_code | class_name | class_filled | class_capacity | |
---|---|---|---|---|
0 | CSE 142 | COMPUTER PRGRMNG I | 519 | 555 |
1 | CSE 142 | COMPUTER PRGRMNG I | 285 | 355 |
2 | CSE 143 | COMPUTER PRGRMNG II | 330 | 400 |
3 | CSE 143 | COMPUTER PRGRMNG II | 135 | 235 |
4 | CSE 143 | COMPUTER PRGRMNG II | 63 | 72 |
... | ... | ... | ... | ... |
120 | CSE 599 | SPECIAL TOPICS | 26 | 35 |
121 | CSE 599 | SPECIAL TOPICS | 3 | 25 |
122 | CSE 600 | INDEPNDNT STDY | 307 | 95 |
123 | CSE 601 | INTERNSHIP | 78 | 70 |
124 | CSE 800 | DOCTORAL DISSERTATN | 69 | 60 |
125 rows × 4 columns
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/umich/umich_combined.csv
A Michigan CS professor created Crapuler to help Michigan students build their class schedules. The site has a handy page at http://crapuler.com/eecs that lists all the CS courses, along with the term each course ran, the course code, and how many students were enrolled. The only problem is that the enrollment totals on this page sum across all of a course's lectures, whereas I want counts per lecture per class. To get those, I visit this page, extract all the links to the classes, visit each class page, and extract the lecture data.
Below is the code which does this:
%%script false --no-raise-error
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import re

url = "http://crapuler.com/eecs?logname=Spring%2FSummer%2FFall+2021"
session = HTMLSession()
res = session.get(url)
res.html.render(sleep=1)
soup = BeautifulSoup(res.text, 'html.parser')
rows = soup.find("table").find_all("tr")

course_codes = []
course_links = []
course_types = []
for row in rows:
    data = row.find_all("td")
    i = 0
    is_fall_2021 = True
    while i < len(data) and is_fall_2021:
        if data[i].span.has_attr("title"):
            if data[i].span["title"] == "Term":
                # make sure this is a fall 2021 course; if not, go to the next row
                if not "Fall 2021" in data[i].span.text:
                    is_fall_2021 = False
            elif data[i].span["title"] == "Enrollment History":
                # go up one element to get the link
                link = row.find("a", href=True)
                course_codes.append(link.text)
                course_links.append("http://crapuler.com/" + link["href"])
            elif data[i].span["title"] == "Section Type":
                course_types.append(data[i].span.text)
        i += 1

# loop over course links to get individual section data
# lists for the dataframe
course_codes_sections = []
course_types_sections = []
course_instructors_sections = []
course_times_sections = []
course_enrollments = []
course_capacities = []
for i, link in enumerate(course_links):
    res = session.get(link)
    res.html.render(sleep=2)
    soup = BeautifulSoup(res.text, 'html.parser')
    # assuming the lecture table is always first
    lecture_table = soup.find("table")
    # get the table rows
    rows = lecture_table.find_all("tr")
    # check that this really is the lecture table
    if "Lectures" in rows[0].text:
        # loop over rows, skipping the first 2 which don't have the data we want
        for j in range(2, len(rows)):
            # get row data
            row_data = rows[j].find_all("td")
            course_codes_sections.append(course_codes[i])
            course_types_sections.append(course_types[i])
            # get instructor names
            instructor_links = row_data[1].find_all("a")
            instructors = []
            for instructor in instructor_links:
                instructors.append(instructor.text.strip())
            # sort for a consistent list of instructor names across sections,
            # in case some need to be combined later
            instructors.sort()
            course_instructors_sections.append(instructors)
            # remove whitespace from the meeting time
            time = re.sub(r"\s+", '', row_data[2].text)
            course_times_sections.append(time)
            # assuming enrollment is always at index 4
            course_enrollments.append(int(row_data[4].text))
            # assuming capacity is always at index 5
            course_capacities.append(int(row_data[5].text))
    else:
        print("at {}, lecture table not first".format(link))

classes_df = pd.DataFrame({"course_code": course_codes_sections, "course_type": course_types_sections, "course_enrollment": course_enrollments, "course_capacity": course_capacities, "course_time": course_times_sections, "course_instructors": course_instructors_sections})
classes_df.to_csv("umich.csv")
michigan = pd.read_csv("class_sizes_data/umich.csv")
michigan
Unnamed: 0 | course_code | course_type | course_enrollment | course_capacity | course_time | course_instructors | |
---|---|---|---|---|---|---|---|
0 | 0 | EECS 183 | Lecture | 220 | 245 | TuTh8:30AM-10:00AM | ['Tabassum, Nuzhat', 'Torralva, Ben'] |
1 | 1 | EECS 183 | Lecture | 189 | 192 | TuTh11:30AM-1:00PM | ['Kloosterman, John Sidney', 'Tabassum, Nuzhat'] |
2 | 2 | EECS 183 | Lecture | 250 | 250 | TuTh1:00PM-2:30PM | ['Arthur, William', 'Tabassum, Nuzhat'] |
3 | 3 | EECS 183 | Lecture | 224 | 225 | TuTh2:30PM-4:00PM | ['Kloosterman, John Sidney', 'Tabassum, Nuzhat'] |
4 | 4 | EECS 183 | Lecture | 257 | 260 | TuTh4:00PM-5:30PM | ['Arthur, William', 'Tabassum, Nuzhat'] |
... | ... | ... | ... | ... | ... | ... | ... |
244 | 244 | ENGR 101 | Lecture | 182 | 180 | TBATBA | ['Burdick, Laura'] |
245 | 245 | ENGR 101 | Lecture | 176 | 172 | TBATBA | ['Alford, Laura'] |
246 | 246 | ENGR 101 | Lecture | 181 | 176 | TBATBA | ['Alford, Laura'] |
247 | 247 | ENGR 101 | Lecture | 185 | 184 | TBATBA | ['Alford, Laura'] |
248 | 248 | ENGR 151 | Lecture | 205 | 220 | TuTh12:00PM-1:30PM | ['Thomas, Alexander George Roy'] |
249 rows × 7 columns
This looks good; however, I noticed that some lectures share the same time and the same professors, which means they should be combined into one lecture. I also noticed that some lectures have 0 students enrolled or a capacity of 0, so I'll filter those out. I fixed both problems with the following code.
%%script false --no-raise-error
import pandas as pd
classes_df = pd.read_csv("umich/umich.csv")
# if row course_code, course_time, and course_instructors are the same,
# combine rows into one row
# ex class eecs 445
grouped = classes_df.groupby(["course_code", "course_time", "course_instructors"])
result = grouped.agg({"course_enrollment": "sum", "course_capacity": "sum"})
result.to_csv("umich_combined.csv")
The following output is the cleaned Michigan data:
michigan = pd.read_csv("class_sizes_data/umich_combined.csv")
michigan = michigan[(michigan["course_capacity"] > 0) & (michigan["course_enrollment"] > 0)]
michigan = michigan.reset_index(drop=True)
michigan
course_code | course_time | course_instructors | course_enrollment | course_capacity | |
---|---|---|---|---|---|
0 | EECS 183 | TuTh11:30AM-1:00PM | ['Kloosterman, John Sidney', 'Tabassum, Nuzhat'] | 189 | 192 |
1 | EECS 183 | TuTh1:00PM-2:30PM | ['Arthur, William', 'Tabassum, Nuzhat'] | 250 | 250 |
2 | EECS 183 | TuTh2:30PM-4:00PM | ['Kloosterman, John Sidney', 'Tabassum, Nuzhat'] | 224 | 225 |
3 | EECS 183 | TuTh4:00PM-5:30PM | ['Arthur, William', 'Tabassum, Nuzhat'] | 257 | 260 |
4 | EECS 183 | TuTh8:30AM-10:00AM | ['Tabassum, Nuzhat', 'Torralva, Ben'] | 220 | 245 |
... | ... | ... | ... | ... | ... |
164 | EECS 598 | We4:30PM-5:30PM | ['Hofmann, Heath', 'Liu, Mingyan'] | 53 | 80 |
165 | EECS 644 | MoWe10:00AM-11:30AM | ['Polk, Thad'] | 14 | 12 |
166 | ENGR 101 | TBATBA | ['Alford, Laura'] | 542 | 532 |
167 | ENGR 101 | TBATBA | ['Burdick, Laura'] | 182 | 180 |
168 | ENGR 151 | TuTh12:00PM-1:30PM | ['Thomas, Alexander George Roy'] | 205 | 220 |
169 rows × 5 columns
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/ucb/ucb.csv
I used Berkeley's public class registry, https://classes.berkeley.edu, to get class-size data. I made a query for all CS courses and the registrar returned several pages of results. Conveniently, Berkeley's registrar returns all of its data in JSON format, which made extracting the data straightforward.
One assumption I'm making is that each class listing is taught by one professor. A friend at Berkeley confirmed this is fair: a listing may show two professors, but only one teaches at a time, or one is the main professor.
The difficulty came from CS classes that are cross-listed with other departments: the CS listing shows 0 students enrolled, with all of the students enrolled under the other department's name. To remedy this, I find the cross-listed class links in the CS class JSON and visit those pages individually to extract the enrollment data.
Below is the code which does all this:
%%script false --no-raise-error
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import json
import time
import pandas as pd

url = "https://classes.berkeley.edu/search/class?page="
url2 = "&f[0]=im_field_subject_area%3A483&f[1]=im_field_term_name%3A2208"
pages = 5
session = HTMLSession()
course_names = []
course_codes = []
course_types = []
course_enrollments = []
course_capacities = []
for i in range(0, pages):
    res = session.get(url + str(i) + url2)
    res.html.render(sleep=1)
    soup = BeautifulSoup(res.text, 'html.parser')
    results = soup.find_all("li", {"class": "search-result"})
    results_data = []
    for result in results:
        # cross-listed courses are in tag data-json; if not cross-listed, "crossListing" will not be a key in the json
        # data-json has combined enrollment counts, and enrollment for just the CS version
        # the course link is in tag data-node, but not needed since data-json already has enrollment counts
        results_data.append(json.loads(result.div['data-json']))
    # key - course code
    # value - list of cross-listed course links, so we can visit those pages later for enrollment counts
    cross_listing_links = {}
    for data in results_data:
        course_names.append(data["class"]["course"]["title"])
        course_codes.append(data["class"]["course"]["displayName"])
        course_types.append(data["component"]["code"])
        course_enrollments.append(data["enrollmentStatus"]["enrolledCount"])
        course_capacities.append(data["enrollmentStatus"]["maxEnroll"])
        if "crossListing" in data:
            links = []
            for attribute, value in data["crossListing"].items():
                links.append("https://classes.berkeley.edu/" + data["crossListing"][attribute]["path"])
            cross_listing_links[data["class"]["course"]["displayName"]] = links
    # handle cross-listed courses by visiting their links to get enrollment info
    for course_code in cross_listing_links:
        for link in cross_listing_links[course_code]:
            res = session.get(link)
            res.html.render(sleep=1)
            soup = BeautifulSoup(res.text, 'html.parser')
            data = json.loads(soup.find("div", {"class": "handlebarData theme_is_whitehot"})["data-json"])
            # add cross-listed course data
            course_names.append(data["class"]["course"]["title"])
            course_codes.append(data["class"]["course"]["displayName"])
            course_types.append(data["component"]["code"])
            course_enrollments.append(data["enrollmentStatus"]["enrolledCount"])
            course_capacities.append(data["enrollmentStatus"]["maxEnroll"])
            time.sleep(1)
    # sleep for each page
    time.sleep(1)
classes_df = pd.DataFrame({"course_code": course_codes, "course_type": course_types, "course_name": course_names, "course_enrollment": course_enrollments, "course_capacity": course_capacities})
classes_df.to_csv("ucb.csv")
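As an aside, the stray "Unnamed: 0" column that appears when reading the CSV back in comes from to_csv writing the dataframe index by default; passing index=False avoids it. A quick sketch (using an in-memory buffer instead of a file):

```python
import io
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

buf = io.StringIO()
df.to_csv(buf)  # default index=True also writes the row index
buf.seek(0)
cols_default = pd.read_csv(buf).columns.tolist()
print(cols_default)   # ['Unnamed: 0', 'a']

buf2 = io.StringIO()
df.to_csv(buf2, index=False)  # suppress the index column
buf2.seek(0)
cols_no_index = pd.read_csv(buf2).columns.tolist()
print(cols_no_index)  # ['a']
```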
Here is what the CSV and the corresponding dataframe look like:
berkeley = pd.read_csv("class_sizes_data/ucb.csv")
# to_csv saved the dataframe index as an unnamed column, so drop it
berkeley = berkeley.loc[:, ~berkeley.columns.str.contains('^Unnamed')]
berkeley
course_code | course_type | course_name | course_enrollment | course_capacity | |
---|---|---|---|---|---|
0 | COMPSCI C8 | LEC | Foundations of Data Science | 0 | 0 |
1 | COMPSCI 10 | LEC | The Beauty and Joy of Computing | 370 | 350 |
2 | COMPSCI 24 | SEM | Freshman Seminars | 0 | 12 |
3 | COMPSCI 24 | SEM | Freshman Seminars | 13 | 15 |
4 | COMPSCI 24 | SEM | Freshman Seminars | 20 | 20 |
... | ... | ... | ... | ... | ... |
79 | COMPSCI 298 | LEC | Group Studies Seminars, or Group Research | 22 | 28 |
80 | COMPSCI 298 | LEC | Group Studies Seminars, or Group Research | 11 | 20 |
81 | COMPSCI 370 | LEC | Adaptive Instruction Methods in Computer Science | 10 | 80 |
82 | COMPSCI 370 | LEC | Adaptive Instruction Methods in Computer Science | 28 | 80 |
83 | COMPSCI 375 | DIS | Teaching Techniques for Computer Science | 94 | 100 |
84 rows × 5 columns
As you can see, some classes listed are not lectures and some have 0 enrollment. I need to filter for courses with non-zero enrollment and "LEC" type.
berkeley = berkeley[(berkeley["course_type"] == "LEC") & (berkeley["course_enrollment"] > 0)]
berkeley = berkeley.reset_index(drop=True)
berkeley
course_code | course_type | course_name | course_enrollment | course_capacity | |
---|---|---|---|---|---|
0 | COMPSCI 10 | LEC | The Beauty and Joy of Computing | 370 | 350 |
1 | COMPSCI 61A | LEC | The Structure and Interpretation of Computer P... | 1666 | 1900 |
2 | COMPSCI 61B | LEC | Data Structures | 904 | 1000 |
3 | COMPSCI 61C | LEC | Great Ideas of Computer Architecture (Machine ... | 962 | 1175 |
4 | COMPSCI 70 | LEC | Discrete Mathematics and Probability Theory | 785 | 850 |
5 | COMPSCI 88 | LEC | Computational Structures in Data Science | 261 | 265 |
6 | COMPSCI 161 | LEC | Computer Security | 609 | 625 |
7 | COMPSCI 162 | LEC | Operating Systems and System Programming | 415 | 410 |
8 | COMPSCI 164 | LEC | Programming Languages and Compilers | 124 | 128 |
9 | DATA C8 | LEC | Foundations of Data Science | 1436 | 1440 |
10 | DATA C100 | LEC | Principles & Techniques of Data Science | 919 | 920 |
11 | COMPSCI 169A | LEC | Introduction to Software Engineering | 296 | 315 |
12 | COMPSCI 170 | LEC | Efficient Algorithms and Intractable Problems | 722 | 730 |
13 | COMPSCI 188 | LEC | Introduction to Artificial Intelligence | 651 | 685 |
14 | COMPSCI 189 | LEC | Introduction to Machine Learning | 270 | 290 |
15 | COMPSCI 194 | LEC | Special Topics | 70 | 98 |
16 | COMPSCI 194 | LEC | Special Topics | 180 | 177 |
17 | COMPSCI 194 | LEC | Special Topics | 83 | 80 |
18 | COMPSCI 194 | LEC | Special Topics | 34 | 40 |
19 | COMPSCI 194 | LEC | Special Topics | 174 | 190 |
20 | COMPSCI 195 | LEC | Social Implications of Computer Technology | 337 | 430 |
21 | CHEM C191 | LEC | Quantum Information Science and Technology | 15 | 27 |
22 | PHYSICS C191 | LEC | Quantum Information Science and Technology | 56 | 75 |
23 | COMPSCI C249A | LEC | Introduction to Embedded Systems | 7 | 8 |
24 | COMPSCI 262A | LEC | Advanced Topics in Computer Systems | 39 | 45 |
25 | COMPSCI 263 | LEC | Design of Programming Languages | 41 | 59 |
26 | COMPSCI 264 | LEC | Implementation of Programming Languages | 16 | 27 |
27 | COMPSCI C281A | LEC | Statistical Learning Theory | 86 | 120 |
28 | COMPSCI 285 | LEC | Deep Reinforcement Learning, Decision Making, ... | 228 | 235 |
29 | ELENG C249A | LEC | Introduction to Embedded Systems | 15 | 17 |
30 | COMPSCI 289A | LEC | Introduction to Machine Learning | 65 | 50 |
31 | COMPSCI 294 | LEC | Special Topics | 18 | 0 |
32 | COMPSCI 294 | LEC | Special Topics | 31 | 29 |
33 | COMPSCI 294 | LEC | Special Topics | 98 | 98 |
34 | COMPSCI 294 | LEC | Special Topics | 1 | 18 |
35 | COMPSCI 294 | LEC | Special Topics | 44 | 45 |
36 | COMPSCI 294 | LEC | Special Topics | 16 | 30 |
37 | COMPSCI 294 | LEC | Special Topics | 26 | 32 |
38 | COMPSCI 294 | LEC | Special Topics | 49 | 100 |
39 | COMPSCI 294 | LEC | Special Topics | 49 | 50 |
40 | COMPSCI 294 | LEC | Special Topics | 32 | 0 |
41 | COMPSCI 294 | LEC | Special Topics | 8 | 35 |
42 | COMPSCI 294 | LEC | Special Topics | 60 | 100 |
43 | COMPSCI 294 | LEC | Special Topics | 25 | 30 |
44 | COMPSCI 294 | LEC | Special Topics | 13 | 35 |
45 | COMPSCI 294 | LEC | Special Topics | 16 | 20 |
46 | COMPSCI 298 | LEC | Group Studies Seminars, or Group Research | 1 | 98 |
47 | COMPSCI 298 | LEC | Group Studies Seminars, or Group Research | 22 | 28 |
48 | COMPSCI 298 | LEC | Group Studies Seminars, or Group Research | 11 | 20 |
49 | COMPSCI 370 | LEC | Adaptive Instruction Methods in Computer Science | 10 | 80 |
50 | COMPSCI 370 | LEC | Adaptive Instruction Methods in Computer Science | 28 | 80 |
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/csdepts/faculty_count.csv
These counts are from http://csrankings.org/#/index?all&us. Since I only had 30 institutions to enter, I just did this manually by copying the values from the table.
One thing to note is that not all of these faculty are necessarily in their institution's CS department. CSRankings tracks faculty who have papers accepted at CS-related conferences, so it is possible, for example, for a mechanical engineering professor to count toward an institution's total after getting a paper accepted at a robotics conference. While these professors may sit in different departments, I still count them toward total CS faculty because CS students can get involved in their projects and could possibly take classes with them.
faculty_counts = pd.read_csv("faculty_count.csv")
faculty_counts
University | Faculty Count | |
---|---|---|
0 | Carnegie Mellon University | 161 |
1 | University of Illinois at Urbana-Champaign | 109 |
2 | Massachusetts Institute of Technology | 89 |
3 | University of California - San Diego | 108 |
4 | Stanford University | 68 |
5 | University of Michigan | 96 |
6 | University of California - Berkeley | 91 |
7 | Cornell University | 81 |
8 | University of Washington | 75 |
9 | Georgia Institute of Technology | 94 |
10 | University of Maryland - College Park | 85 |
11 | University of Wisconsin - Madison | 62 |
12 | Columbia University | 55 |
13 | Northeastern University | 73 |
14 | University of Texas at Austin | 47 |
15 | University of Pennsylvania | 63 |
16 | Purdue University | 69 |
17 | University of California - Los Angeles | 42 |
18 | University of Southern California | 46 |
19 | University of Massachusetts Amherst | 57 |
20 | New York University | 54 |
21 | University of California - Santa Barbara | 37 |
22 | Rutgers University | 50 |
23 | University of California - Irvine | 69 |
24 | Stony Brook University | 54 |
25 | Princeton University | 41 |
26 | University of Chicago | 44 |
27 | Pennsylvannia State University | 59 |
28 | Harvard University | 29 |
29 | Northwestern University | 42 |
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/csdepts/degrees_conferred.csv
Since my project examines the resources available to CS students at particular institutions, I need to know how many CS students there are at each one. At first I tried to find how many current CS majors each institution had, but many places don't list this publicly and I would have had to reach out in private correspondence. I did reach out to several schools, but only one got back to me (thanks Professor Sahami at Stanford!).
Since I couldn't get current CS major counts for every institution, I needed a source that would be publicly available across the board, so I landed on degrees conferred. This is a better source because many universities, public and private, report the degrees they confer at the end of each year by major, usually through their office of institutional research. If that data is not available, many universities also report degrees conferred to the government through the IPEDS system, and some CS departments state on their website, usually in an "about us" section, how many CS degrees they awarded.
Degrees awarded can be used to make a conservative estimate of how many CS majors a department has. Under the fair assumption that CS grows more popular every year, multiplying the number of degrees awarded per year by 4 (one cohort per class year) yields an underestimate of the total number of CS majors.
Note: the CS degrees awarded figure also includes degrees like data science. This is because some schools have a separate data science major while others have a data science track within their CS program.
I have compiled this data manually and included relevant notes for each data source.
degrees_conferred = pd.read_csv("degrees_conferred.csv")
degrees_conferred
University | Degrees Conferred | Degrees Conferred Source | Date | Notes | |
---|---|---|---|---|---|
0 | Carnegie Mellon University | 230 | https://www.cmu.edu/ira/degrees-granted/ay2020... | 20-21 | NaN |
1 | University of Illinois at Urbana-Champaign | 544 | https://cs.illinois.edu/about/statistics | 20-21 | NaN |
2 | Massachusetts Institute of Technology | 408 | https://registrar.mit.edu/stats-reports/degree... | 20-21 | added together cs eng, cs molec bio, cs econ, ... |
3 | University of California - San Diego | 799 | https://ir.ucsd.edu/undergrad/stats-data/ug-de... | 20-21 | cs and ds |
4 | Stanford University | 307 | https://registrar.stanford.edu/everyone/degree... | 19-20 | NaN |
5 | University of Michigan | 1186 | https://cse.engin.umich.edu/about/by-the-numbe... | 20-21 | second link has current undergraduate enrollme... |
6 | University of California - Berkeley | 1938 | https://pages.github.berkeley.edu/OPA/our-berk... | 20-21 | ls cs - 802, ls ds - 620, eecs - 516 |
7 | Cornell University | 300+ | https://www.cs.cornell.edu/information/about | 20-21 | assuming most recent data, esitmate not exact :( |
8 | University of Washington | 450+ | https://www.cs.washington.edu/about_us/ | 20-21 | estimate, not exact |
9 | Georgia Institute of Technology | 737 | https://lite.gatech.edu/lite_script/dashboards... | 20-21 | NaN |
10 | University of Maryland - College Park | 797 | https://irpa.umd.edu/CampusCounts/Retention/ug... | 20-21 | NaN |
11 | University of Wisconsin - Madison | 675 | https://tableau.wisconsin.edu/views/TrendsinDe... | 20-21 | NaN |
12 | Columbia University | 190 | https://opir.columbia.edu/sites/default/files/... | 20-21 | NaN |
13 | Northeastern University | 314 | https://datausa.io/profile/university/northeas... | 19-20 | ipeds source |
14 | University of Texas at Austin | 385 | https://catalog.utexas.edu/general-information... | 19-20 | NaN |
15 | University of Pennsylvania | 107 | https://www.scholarships.com/colleges/universi... | 19-20 | not the best source |
16 | Purdue University | 466 | https://wpvwebsas03.www.purdue.edu/cgi-bin/bro... | 20-21 | includes datascience |
17 | University of California - Los Angeles | 334 | https://www.cs.ucla.edu/ | 20-21 | 1337 is total majors, divided by 4 to get appr... |
18 | University of Southern California | 247 | https://oir.usc.edu/wp-content/uploads/2021/09... | 20-21 | NaN |
19 | University of Massachusetts Amherst | 271 | https://www.umass.edu/uair/sites/default/files... | 20-21 | NaN |
20 | New York University | 384 | https://datausa.io/profile/university/new-york... | 18-19 | bad source, includes infosci but no info sci m... |
21 | University of California - Santa Barbara | 160 | https://bap.ucsb.edu/institutional-research/uc... | 20-21 | NaN |
22 | Rutgers University | 572 | https://datausa.io/profile/university/rutgers-... | 19-20 | bad source, may include info sci |
23 | University of California - Irvine | 1063 | https://datahub.oapir.uci.edu/Degrees-Awarded-... | 20-21 | includes: cse, cs, ds, computer game science, ... |
24 | Stony Brook University | 250 | https://datausa.io/profile/university/stony-br... | 19-20 | bad source?? |
25 | Princeton University | 150 | https://tableaupublic.princeton.edu/t/Officeof... | 20-21 | NaN |
26 | University of Chicago | 80 | https://www.scholarships.com/colleges/universi... | 19-20 | from ipeds |
27 | Pennsylvannia State University | 413 | https://datadigest.psu.edu/degrees-awarded/ | 20-21 | includes: cs, ds |
28 | Harvard University | 131 | https://oir.harvard.edu/fact-book/degrees-awar... | 20-21 | NaN |
29 | Northwestern University | 107 | https://www.registrar.northwestern.edu/documen... | 20-21 | NaN |
Cornell University - Their institutional research website didn't report how many CS degrees were conferred; it lumped all engineering majors together. IPEDS had old data, so I used the Cornell CS department website, which says they awarded over 300 degrees. I'm assuming the website is regularly updated and that this is the most recent year's figure.
University of Washington - The same situation as Cornell: their CS department website says they awarded over 450 degrees, and I'm assuming this is for the most recent year.
Most recent year - While many schools had data for the most recent school year, a few only had data through the 19-20 school year, and NYU only through 18-19. Under the assumption that CS enrollment is only growing, schools with older data will have their estimated CS major counts understated more than schools with recent data.
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/csdepts/rankings.csv
CS departments are ranked mainly by how much research they output in well known conferences. The two main sources which rank CS departments are CSRankings.org, which is where I got the faculty count data, and USNews. Again, since there were only 30 schools, I collected this data manually from their ranking tables.
Note: This data was collected in March 2022, so some CSRankings positions may have shifted since then as new conference results are folded into the paper counts.
rankings = pd.read_csv("rankings.csv")
rankings
CSRankgings Rank | USNews Rank | University | |
---|---|---|---|
0 | 1 | 1 | Carnegie Mellon University |
1 | 2 | 5 | University of Illinois at Urbana-Champaign |
2 | 3 | 1 | Massachusetts Institute of Technology |
3 | 4 | 16 | University of California - San Diego |
4 | 5 | 1 | Stanford University |
5 | 6 | 11 | University of Michigan |
6 | 7 | 1 | University of California - Berkeley |
7 | 8 | 6 | Cornell University |
8 | 9 | 6 | University of Washington |
9 | 10 | 8 | Georgia Institute of Technology |
10 | 11 | 16 | University of Maryland - College Park |
11 | 12 | 13 | University of Wisconsin - Madison |
12 | 13 | 13 | Columbia University |
13 | 13 | 49 | Northeastern University |
14 | 15 | 10 | University of Texas at Austin |
15 | 16 | 19 | University of Pennsylvania |
16 | 17 | 20 | Purdue University |
17 | 18 | 13 | University of California - Los Angeles |
18 | 19 | 20 | University of Southern California |
19 | 20 | 20 | University of Massachusetts Amherst |
20 | 21 | 30 | New York University |
21 | 22 | 37 | University of California - Santa Barbara |
22 | 23 | 37 | Rutgers University |
23 | 24 | 30 | University of California - Irvine |
24 | 25 | 40 | Stony Brook University |
25 | 26 | 8 | Princeton University |
26 | 27 | 30 | University of Chicago |
27 | 27 | 30 | Pennsylvannia State University |
28 | 29 | 16 | Harvard University |
29 | 30 | 30 | Northwestern University |
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/csdepts/school_alumni.csv
For many CS students, the goal is to land a job in Big Tech, which I've defined here as Amazon, Apple, Google, Meta, and Microsoft. I collected data manually from LinkedIn on each school's alumni at these companies. One useful feature of LinkedIn is that a school's profile has an Alumni tab with a table called "Where they work": for everyone on LinkedIn who lists that school, their company is added to the table, so you can see how many people from the school work at each company.
One caveat is that not all of the people at these tech companies are software engineers; they could be artists, business people, or logistics workers. This data was mostly collected for fun and needs to be taken with a big grain of salt when evaluating Big Tech placement for each school.
alumni = pd.read_csv("school_alumni.csv")
alumni
school | company | count | |
---|---|---|---|
0 | Carnegie Mellon University | Meta | 621 |
1 | Carnegie Mellon University | Amazon | 1118 |
2 | Carnegie Mellon University | Apple | 1011 |
3 | Carnegie Mellon University | Microsoft | 1101 |
4 | Carnegie Mellon University | 2782 | |
... | ... | ... | ... |
145 | Northwestern University | Meta | 310 |
146 | Northwestern University | Amazon | 623 |
147 | Northwestern University | Apple | 393 |
148 | Northwestern University | Microsoft | 424 |
149 | Northwestern University | 899 |
150 rows × 3 columns
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/csdepts/cost.csv
I collected data on how much undergraduate tuition is at each of these schools. Public schools have in-state and out-of-state rates listed while private schools just have out-of-state costs listed. I collected this data manually from USNews, from their national rankings page.
cost = pd.read_csv("cost.csv")
cost
school | type | out of state | in state | |
---|---|---|---|---|
0 | Carnegie Mellon University | private | 58924 | NaN |
1 | University of Illinois at Urbana-Champaign | public | 34316 | 16866.0 |
2 | Massachusetts Institute of Technology | private | 55878 | NaN |
3 | University of California - San Diego | public | 44487 | 14733.0 |
4 | Stanford University | private | 56169 | NaN |
5 | University of Michigan | public | 53232 | 16178.0 |
6 | University of California - Berkeley | public | 44115 | 14361.0 |
7 | Cornell University | private | 61015 | NaN |
8 | University of Washington | public | 39906 | 12076.0 |
9 | Georgia Institute of Technology | public | 33794 | 12682.0 |
10 | University of Maryland - College Park | public | 38636 | 10954.0 |
11 | University of Wisconsin - Madison | public | 38608 | 10720.0 |
12 | Columbia University | private | 63530 | NaN |
13 | Northeastern University | private | 57592 | NaN |
14 | University of Texas at Austin | public | 40032 | 11448.0 |
15 | University of Pennsylvania | private | 61710 | NaN |
16 | Purdue University | public | 28794 | 9992.0 |
17 | University of California - Los Angeles | public | 43022 | 13268.0 |
18 | University of Southern California | private | 60275 | NaN |
19 | University of Massachusetts Amherst | public | 36964 | 16439.0 |
20 | New York University | private | 56500 | NaN |
21 | University of California - Santa Barbara | public | 44196 | 14442.0 |
22 | Rutgers University | public | 33005 | 15804.0 |
23 | University of California - Irvine | public | 43709 | 13955.0 |
24 | Stony Brook University | public | 28080 | 10410.0 |
25 | Princeton University | private | 56010 | NaN |
26 | University of Chicago | private | 60963 | NaN |
27 | Pennsylvannia State University | public | 36476 | 18898.0 |
28 | Harvard University | private | 55587 | NaN |
29 | Northwestern University | private | 60984 | NaN |
Dataset location: https://github.com/benjaminnow/cs_class_sizes/blob/main/csdepts/abbreviations.csv
I mapped each school's name to a shorter abbreviation so the following code can be more concise.
abbreviations = pd.read_csv("abbreviations.csv")
abbreviations
University | Abbreviation | |
---|---|---|
0 | Carnegie Mellon University | cmu |
1 | University of Illinois at Urbana-Champaign | uiuc |
2 | Massachusetts Institute of Technology | mit |
3 | University of California - San Diego | ucsd |
4 | Stanford University | stanford |
5 | University of Michigan | michigan |
6 | University of California - Berkeley | cal |
7 | Cornell University | cornell |
8 | University of Washington | washington |
9 | Georgia Institute of Technology | gatech |
10 | University of Maryland - College Park | maryland |
11 | University of Wisconsin - Madison | wisconsin |
12 | Columbia University | columbia |
13 | Northeastern University | northeastern |
14 | University of Texas at Austin | texas |
15 | University of Pennsylvania | upenn |
16 | Purdue University | purdue |
17 | University of California - Los Angeles | ucla |
18 | University of Southern California | usc |
19 | University of Massachusetts Amherst | umass |
20 | New York University | nyu |
21 | University of California - Santa Barbara | ucsb |
22 | Rutgers University | rutgers |
23 | University of California - Irvine | uci |
24 | Stony Brook University | sbu |
25 | Princeton University | princeton |
26 | University of Chicago | uchicago |
27 | Pennsylvannia State University | psu |
28 | Harvard University | harvard |
29 | Northwestern University | northwestern |
In this section I'm going to make the data I collected easier to work with. Some of the dataframes can be combined into one. Other dataframes like school alumni and class sizes will have to be kept separate.
I'm going to make a new dataframe with estimated CS major count based on degrees conferred by institution. This will just be multiplying the degrees conferred by 4 to get a conservative estimate.
# first convert Cornell's "300+" and Washington's "450+" to plain numeric estimates
degrees_conferred.at[7, "Degrees Conferred"] = 300
degrees_conferred.at[8, "Degrees Conferred"] = 450
# convert column to numeric
degrees_conferred["Degrees Conferred"] = pd.to_numeric(degrees_conferred["Degrees Conferred"])
degrees_conferred["estimated_majors"] = degrees_conferred["Degrees Conferred"] * 4
estimated_majors = pd.concat([degrees_conferred["University"], degrees_conferred["estimated_majors"]], axis=1)
estimated_majors
University | estimated_majors | |
---|---|---|
0 | Carnegie Mellon University | 920 |
1 | University of Illinois at Urbana-Champaign | 2176 |
2 | Massachusetts Institute of Technology | 1632 |
3 | University of California - San Diego | 3196 |
4 | Stanford University | 1228 |
5 | University of Michigan | 4744 |
6 | University of California - Berkeley | 7752 |
7 | Cornell University | 1200 |
8 | University of Washington | 1800 |
9 | Georgia Institute of Technology | 2948 |
10 | University of Maryland - College Park | 3188 |
11 | University of Wisconsin - Madison | 2700 |
12 | Columbia University | 760 |
13 | Northeastern University | 1256 |
14 | University of Texas at Austin | 1540 |
15 | University of Pennsylvania | 428 |
16 | Purdue University | 1864 |
17 | University of California - Los Angeles | 1336 |
18 | University of Southern California | 988 |
19 | University of Massachusetts Amherst | 1084 |
20 | New York University | 1536 |
21 | University of California - Santa Barbara | 640 |
22 | Rutgers University | 2288 |
23 | University of California - Irvine | 4252 |
24 | Stony Brook University | 1000 |
25 | Princeton University | 600 |
26 | University of Chicago | 320 |
27 | Pennsylvannia State University | 1652 |
28 | Harvard University | 524 |
29 | Northwestern University | 428 |
Since public schools have differing in-state and out-of-state rates, I want to combine these into a single number for my analysis. My assumption is that strong public CS schools draw students from out of state; this is supported by my own experience at UMD and anecdotal reports from friends at other public universities. A reasonable estimate is that 25% of CS majors at public universities are out of state, so the blended cost for a public university is 0.75 * (in-state rate) + 0.25 * (out-of-state rate). I'll make a new dataframe showing the cost by school.
import math
def estimate_cost(row):
if not math.isnan(row["in state"]):
return row["in state"] * 0.75 + row["out of state"] * 0.25
else:
return row["out of state"]
cost["estimated_cost"] = cost.apply(lambda row: estimate_cost(row), axis=1)
cost
school | type | out of state | in state | estimated_cost | |
---|---|---|---|---|---|
0 | Carnegie Mellon University | private | 58924 | NaN | 58924.00 |
1 | University of Illinois at Urbana-Champaign | public | 34316 | 16866.0 | 21228.50 |
2 | Massachusetts Institute of Technology | private | 55878 | NaN | 55878.00 |
3 | University of California - San Diego | public | 44487 | 14733.0 | 22171.50 |
4 | Stanford University | private | 56169 | NaN | 56169.00 |
5 | University of Michigan | public | 53232 | 16178.0 | 25441.50 |
6 | University of California - Berkeley | public | 44115 | 14361.0 | 21799.50 |
7 | Cornell University | private | 61015 | NaN | 61015.00 |
8 | University of Washington | public | 39906 | 12076.0 | 19033.50 |
9 | Georgia Institute of Technology | public | 33794 | 12682.0 | 17960.00 |
10 | University of Maryland - College Park | public | 38636 | 10954.0 | 17874.50 |
11 | University of Wisconsin - Madison | public | 38608 | 10720.0 | 17692.00 |
12 | Columbia University | private | 63530 | NaN | 63530.00 |
13 | Northeastern University | private | 57592 | NaN | 57592.00 |
14 | University of Texas at Austin | public | 40032 | 11448.0 | 18594.00 |
15 | University of Pennsylvania | private | 61710 | NaN | 61710.00 |
16 | Purdue University | public | 28794 | 9992.0 | 14692.50 |
17 | University of California - Los Angeles | public | 43022 | 13268.0 | 20706.50 |
18 | University of Southern California | private | 60275 | NaN | 60275.00 |
19 | University of Massachusetts Amherst | public | 36964 | 16439.0 | 21570.25 |
20 | New York University | private | 56500 | NaN | 56500.00 |
21 | University of California - Santa Barbara | public | 44196 | 14442.0 | 21880.50 |
22 | Rutgers University | public | 33005 | 15804.0 | 20104.25 |
23 | University of California - Irvine | public | 43709 | 13955.0 | 21393.50 |
24 | Stony Brook University | public | 28080 | 10410.0 | 14827.50 |
25 | Princeton University | private | 56010 | NaN | 56010.00 |
26 | University of Chicago | private | 60963 | NaN | 60963.00 |
27 | Pennsylvannia State University | public | 36476 | 18898.0 | 23292.50 |
28 | Harvard University | private | 55587 | NaN | 55587.00 |
29 | Northwestern University | private | 60984 | NaN | 60984.00 |
To make the data easier to work with, I'm going to combine dataframes. I will combine: school rankings, school abbreviations, type, faculty count, estimated cs major count, and school cost into one dataframe.
cs_depts = pd.concat([rankings["University"], abbreviations["Abbreviation"],
cost["type"],rankings["CSRankgings Rank"],
rankings["USNews Rank"], faculty_counts["Faculty Count"],
estimated_majors["estimated_majors"], cost["estimated_cost"]], axis=1)
# renaming columns
cs_depts = cs_depts.rename(columns={"University": "uni", "Abbreviation": "abbrv", "CSRankgings Rank": "csr_rank", "USNews Rank": "usn_rank", "Faculty Count": "faculty_count"})
cs_depts
uni | abbrv | type | csr_rank | usn_rank | faculty_count | estimated_majors | estimated_cost | |
---|---|---|---|---|---|---|---|---|
0 | Carnegie Mellon University | cmu | private | 1 | 1 | 161 | 920 | 58924.00 |
1 | University of Illinois at Urbana-Champaign | uiuc | public | 2 | 5 | 109 | 2176 | 21228.50 |
2 | Massachusetts Institute of Technology | mit | private | 3 | 1 | 89 | 1632 | 55878.00 |
3 | University of California - San Diego | ucsd | public | 4 | 16 | 108 | 3196 | 22171.50 |
4 | Stanford University | stanford | private | 5 | 1 | 68 | 1228 | 56169.00 |
5 | University of Michigan | michigan | public | 6 | 11 | 96 | 4744 | 25441.50 |
6 | University of California - Berkeley | cal | public | 7 | 1 | 91 | 7752 | 21799.50 |
7 | Cornell University | cornell | private | 8 | 6 | 81 | 1200 | 61015.00 |
8 | University of Washington | washington | public | 9 | 6 | 75 | 1800 | 19033.50 |
9 | Georgia Institute of Technology | gatech | public | 10 | 8 | 94 | 2948 | 17960.00 |
10 | University of Maryland - College Park | maryland | public | 11 | 16 | 85 | 3188 | 17874.50 |
11 | University of Wisconsin - Madison | wisconsin | public | 12 | 13 | 62 | 2700 | 17692.00 |
12 | Columbia University | columbia | private | 13 | 13 | 55 | 760 | 63530.00 |
13 | Northeastern University | northeastern | private | 13 | 49 | 73 | 1256 | 57592.00 |
14 | University of Texas at Austin | texas | public | 15 | 10 | 47 | 1540 | 18594.00 |
15 | University of Pennsylvania | upenn | private | 16 | 19 | 63 | 428 | 61710.00 |
16 | Purdue University | purdue | public | 17 | 20 | 69 | 1864 | 14692.50 |
17 | University of California - Los Angeles | ucla | public | 18 | 13 | 42 | 1336 | 20706.50 |
18 | University of Southern California | usc | private | 19 | 20 | 46 | 988 | 60275.00 |
19 | University of Massachusetts Amherst | umass | public | 20 | 20 | 57 | 1084 | 21570.25 |
20 | New York University | nyu | private | 21 | 30 | 54 | 1536 | 56500.00 |
21 | University of California - Santa Barbara | ucsb | public | 22 | 37 | 37 | 640 | 21880.50 |
22 | Rutgers University | rutgers | public | 23 | 37 | 50 | 2288 | 20104.25 |
23 | University of California - Irvine | uci | public | 24 | 30 | 69 | 4252 | 21393.50 |
24 | Stony Brook University | sbu | public | 25 | 40 | 54 | 1000 | 14827.50 |
25 | Princeton University | princeton | private | 26 | 8 | 41 | 600 | 56010.00 |
26 | University of Chicago | uchicago | private | 27 | 30 | 44 | 320 | 60963.00 |
27 | Pennsylvannia State University | psu | public | 27 | 30 | 59 | 1652 | 23292.50 |
28 | Harvard University | harvard | private | 29 | 16 | 29 | 524 | 55587.00 |
29 | Northwestern University | northwestern | private | 30 | 30 | 42 | 428 | 60984.00 |
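Note that the concat above works because every CSV happens to list the schools in the same order; pd.concat with axis=1 aligns rows by index position, not by school name. A key-based merge is sturdier if the files ever get re-sorted. A minimal sketch with toy stand-ins for two of the frames (column names mirror the ones above, including the "CSRankgings Rank" spelling from the CSV):

```python
import pandas as pd

# toy stand-ins for rankings and faculty_counts, deliberately in different row orders
rankings = pd.DataFrame({
    "University": ["Carnegie Mellon University", "Harvard University"],
    "CSRankgings Rank": [1, 29],
})
faculty_counts = pd.DataFrame({
    "University": ["Harvard University", "Carnegie Mellon University"],
    "Faculty Count": [29, 161],
})

# merge aligns rows by the "University" key rather than by position
cs_depts = rankings.merge(faculty_counts, on="University", how="left")
print(cs_depts)
```

With merge, each school is matched to its own faculty count even though the two frames list the schools in different orders.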
First, I need to create the dataset for the violin plot. This means we only need school name and the lecture size for each row. All the other information in the scraped lecture size data isn't used.
import matplotlib.pyplot as plt
import seaborn as sns
# setting size of Seaborn plots
sns.set(rc={'figure.figsize':(11.7,8.27)})
# add school name abbreviation to each scraped class size dataframe
umd["school_name"] = "maryland"
gatech["school_name"] = "gatech"
uwash["school_name"] = "washington"
michigan["school_name"] = "michigan"
berkeley["school_name"] = "cal"
# create the dataset for the violin plot
umd_sizes = pd.DataFrame({"school_name": umd["school_name"].values, "class_size": umd["count"].values})
gatech_sizes = pd.DataFrame({"school_name": gatech["school_name"].values, "class_size": gatech["filled_seats"].values})
uwash_sizes = pd.DataFrame({"school_name": uwash["school_name"].values, "class_size": uwash["class_filled"].values})
michigan_sizes = pd.DataFrame({"school_name": michigan["school_name"].values, "class_size": michigan["course_enrollment"].values})
berkeley_sizes = pd.DataFrame({"school_name": berkeley["school_name"].values, "class_size": berkeley["course_enrollment"].values})
all_class_sizes = pd.concat([umd_sizes, gatech_sizes, uwash_sizes, michigan_sizes, berkeley_sizes], axis=0)
fig, axes = plt.subplots()
sns.violinplot(x="school_name", y="class_size", data=all_class_sizes, ax=axes, cut=0)
axes.set_title("Distribution of Lecture Sizes at Selected Schools")
axes.set_xlabel("School Name")
axes.set_ylabel("Lecture Size")
First, let's discuss the similarities. Almost all of the schools seem to have a bulk of their lectures under 250 students. Also, all the schools other than Georgia Tech have a couple mega-lectures, but Berkeley clearly takes the prize in this category with over 1500 students enrolled in their largest lecture.
Now, the differences. Maryland has mostly small-to-medium lectures of under 200 students but has a relatively high proportion of lectures in the 250 to 500 student range compared to the other schools. Georgia Tech doesn't offer mega-lectures but has a relatively large share of mid-sized lectures of 100-250 students. Washington has most of its lectures under 250 students, with a large majority under 100 students, which sets it apart from the other schools. Michigan has two clear peaks in lecture size, the first around 50 students and the second around 175 students. Cal offers proportionally few small lectures compared to the other schools.
While the violin plots are informative about how many lectures of specific sizes there are, they rely on kernel density estimation, which smooths and extrapolates the data to produce the pretty curves. To make things more concrete, I'll create bins of lecture sizes and make a treemap showing the relative percentage of lectures that fall in each size range. The graphing will be done with the Python library Plotly.
I'll first create a dataframe to hold these bounds and counts. The bounds I decided on are in the bounds list below.
# getting into correct format df
# lecture size bounds
bounds = [[1, 20], [21, 40], [41, 80], [81, 150], [151, 300], [301, 600], [601, 1000], [1001, 2000]]
# schools
schools = ["maryland", "gatech", "washington", "michigan", "cal"]
school_data = []
bound_strs = []
count_data = []
percentage_data = []
def add_bounds_data(school_name, school_df, count_name):
    for bound in bounds:
        school_data.append(school_name)
        bound_strs.append(str(bound[0]) + "-" + str(bound[1]))
        # count the lectures whose size falls inside this bound (inclusive)
        courses_num = school_df[(school_df[count_name] >= bound[0]) & (school_df[count_name] <= bound[1])].shape[0]
        count_data.append(courses_num)
        percentage_data.append(courses_num / float(school_df.shape[0]) * 100.0)
add_bounds_data("maryland", umd, "count")
add_bounds_data("gatech", gatech, "filled_seats")
add_bounds_data("washington", uwash, "class_filled")
add_bounds_data("michigan", michigan, "course_enrollment")
add_bounds_data("cal", berkeley, "course_enrollment")
# df with school, bound, count, percentage
bounds_df = pd.DataFrame({"school": school_data, "bound": bound_strs, "count": count_data, "percentage": percentage_data})
bounds_df
school | bound | count | percentage | |
---|---|---|---|---|
0 | maryland | 1-20 | 10 | 12.195122 |
1 | maryland | 21-40 | 21 | 25.609756 |
2 | maryland | 41-80 | 16 | 19.512195 |
3 | maryland | 81-150 | 18 | 21.951220 |
4 | maryland | 151-300 | 10 | 12.195122 |
5 | maryland | 301-600 | 7 | 8.536585 |
6 | maryland | 601-1000 | 0 | 0.000000 |
7 | maryland | 1001-2000 | 0 | 0.000000 |
8 | gatech | 1-20 | 42 | 24.137931 |
9 | gatech | 21-40 | 27 | 15.517241 |
10 | gatech | 41-80 | 43 | 24.712644 |
11 | gatech | 81-150 | 22 | 12.643678 |
12 | gatech | 151-300 | 39 | 22.413793 |
13 | gatech | 301-600 | 1 | 0.574713 |
14 | gatech | 601-1000 | 0 | 0.000000 |
15 | gatech | 1001-2000 | 0 | 0.000000 |
16 | washington | 1-20 | 54 | 43.200000 |
17 | washington | 21-40 | 24 | 19.200000 |
18 | washington | 41-80 | 15 | 12.000000 |
19 | washington | 81-150 | 17 | 13.600000 |
20 | washington | 151-300 | 12 | 9.600000 |
21 | washington | 301-600 | 3 | 2.400000 |
22 | washington | 601-1000 | 0 | 0.000000 |
23 | washington | 1001-2000 | 0 | 0.000000 |
24 | michigan | 1-20 | 31 | 18.343195 |
25 | michigan | 21-40 | 26 | 15.384615 |
26 | michigan | 41-80 | 36 | 21.301775 |
27 | michigan | 81-150 | 44 | 26.035503 |
28 | michigan | 151-300 | 29 | 17.159763 |
29 | michigan | 301-600 | 3 | 1.775148 |
30 | michigan | 601-1000 | 0 | 0.000000 |
31 | michigan | 1001-2000 | 0 | 0.000000 |
32 | cal | 1-20 | 13 | 25.490196 |
33 | cal | 21-40 | 8 | 15.686275 |
34 | cal | 41-80 | 8 | 15.686275 |
35 | cal | 81-150 | 4 | 7.843137 |
36 | cal | 151-300 | 6 | 11.764706 |
37 | cal | 301-600 | 3 | 5.882353 |
38 | cal | 601-1000 | 7 | 13.725490 |
39 | cal | 1001-2000 | 2 | 3.921569 |
import plotly.express as px
colors = px.colors.qualitative.Pastel
# the color of each box depends on the bound so that the reader can easily visually compare different schools
fig = px.treemap(bounds_df, path=[px.Constant("Lecture Sizes at Schools"), 'school', 'bound'], values='percentage',
color='bound', color_discrete_map={'(?)': 'rgb(234, 234, 234)', '1-20': colors[4], '21-40': colors[1], '41-80': colors[5], '81-150': colors[2], '151-300': colors[9], '301-600': colors[6], '601-1000': colors[0], '1001-2000': colors[10]})
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()
This treemap gives us a better idea than the violin plot of how many lectures fall into each size range at these schools. The trends from the violin plot hold up. Washington has a relatively large share of small lectures (1-20) compared to the other schools, and Cal has a lot of large lectures, being the only school with lectures in the 601-1000 and 1001-2000 categories. Maryland has one of the smallest shares of small lectures but a relatively large share of lectures in the 21-40 range.
Here we're going to plot the average lecture size from the scraped data at each of the 5 universities. The faculty counted here are only the professors teaching lectures. This may not be the most meaningful measure because the violin plot showed possible outliers and heavy skew.
schools = ["maryland", "gatech", "washington", "michigan", "cal"]
ratios = [umd_sizes["class_size"].mean(), gatech_sizes["class_size"].mean(), uwash_sizes["class_size"].mean(), michigan_sizes["class_size"].mean(), berkeley_sizes["class_size"].mean()]
fig = sns.barplot(y=schools, x=ratios)
fig.set_title("Average Lecture Size at Selected Universities")
fig.set_xlabel("Lecture Size")
fig.set_ylabel("School")
So, going just by averages, which I mentioned will be distorted for these schools due to the skew and outliers, Cal has the largest average class size and Washington the smallest. Georgia Tech and Michigan both have average class sizes under 100, with Georgia Tech slightly lower than Michigan. Maryland is a clear 4th place with an average class size of over 100.
Since averages are problematic, I'll plot the medians below:
ratios = [umd_sizes["class_size"].median(), gatech_sizes["class_size"].median(), uwash_sizes["class_size"].median(), michigan_sizes["class_size"].median(), berkeley_sizes["class_size"].median()]
fig = sns.barplot(y=schools, x=ratios)
fig.set_title("Median Lecture Size at Selected Universities")
fig.set_xlabel("Lecture Size")
fig.set_ylabel("School")
Looking at medians shows a completely different picture. Cal's average lecture size looks to be brought up by a few mega-lectures. Another interesting thing to note is that all schools had their median lecture size be lower than their average lecture size. This suggests that for these schools, their averages are brought up by some relatively large lectures.
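This mean-above-median pattern is exactly what right-skewed data produces. A quick sanity check with made-up lecture sizes (not the scraped data), just to illustrate the effect:

```python
import pandas as pd

# Hypothetical lecture sizes: mostly small classes plus two mega-lectures,
# mimicking the right skew seen in the violin plots above.
sizes = pd.Series([25, 30, 35, 40, 45, 60, 80, 120, 400, 900])

mean = sizes.mean()      # 173.5: pulled up by the two huge lectures
median = sizes.median()  # 52.5: unaffected by the extremes
print(mean, median)
```

Dropping the two largest values would barely move the median but would cut the mean sharply, which is why the medians below paint a different picture than the averages.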
I'll add a student:faculty ratio column to the aggregated dataframe cs_depts. Then I'll make a bar plot.
cs_depts["ratio"] = cs_depts["estimated_majors"] / cs_depts["faculty_count"]
fig = sns.barplot(y=cs_depts["abbrv"].values, x=cs_depts["ratio"].values)
fig.bar_label(fig.containers[0])
fig.set_title("Student:Faculty Ratio at Selected Universities")
fig.set_xlabel("Student:Faculty")
fig.set_ylabel("School")
Schools in this barplot are listed from highest ranking at the top to lowest ranking at the bottom according to CSRankings. Just from eyeballing the chart, it looks like private schools have much smaller student:faculty ratios than public schools. Cal is still a big outlier among the 30 schools with regard to student:faculty ratio. The top 5 schools seem to have relatively low student:faculty ratios. Maryland seems to be on the high side relative to the top-30 schools.
private = cs_depts[cs_depts["type"] == "private"]
public = cs_depts[cs_depts["type"] == "public"]
x = ["public", "private"]
y = [public["ratio"].mean(), private["ratio"].mean()]
fig = sns.barplot(x=x, y=y)
fig.bar_label(fig.containers[0])
fig.set_title("Average Student:Faculty Ratio at Public and Private Universities")
fig.set_xlabel("Type")
fig.set_ylabel("Student:Faculty Ratio")
It seems that my eyeballing of all 30 schools was correct: private universities do have smaller student:faculty ratios than public universities. On average, fewer than half as many students compete for each professor at private schools compared with public schools.
In a hypothetical scenario where all CS students' tuition was divided equally among the faculty of the CS department, the following chart shows how much each faculty member would receive in income. Schools with high income per faculty can be considered to have a higher "margin" on their product: they don't have to "sell" (i.e., hire) as many faculty as other schools to bring in the same income in undergraduate tuition. This can be seen as loosely representative of the real world, where some percentage of undergraduate tuition actually does go to the CS department.
cs_depts["dol_per_fac"] = cs_depts["estimated_cost"] * cs_depts["estimated_majors"] / cs_depts["faculty_count"]
fig = sns.barplot(y=cs_depts["abbrv"].values, x=cs_depts["dol_per_fac"].values)
fig.bar_label(fig.containers[0])
fig.set_title("Income Per Faculty at Selected Universities")
fig.set_xlabel("Income Per Faculty")
fig.set_ylabel("School")
I would have thought that higher-ranking institutions would generally bring in higher "margins" since people are drawn to prestige and well-known professors. Some places like CMU have surprisingly low numbers because their low student:faculty ratio drags income down more than their higher tuition lifts it. There also doesn't seem to be a strong pattern between a university being public or private and the income per professor. Maryland seems to be around the middle of the pack in income per faculty member.
Hypothesis: CSRankings is based on research output, not undergraduate teaching, so there should be a weak positive correlation between student:faculty ratios and ranking (positive in the sense that better rank = lower number). USNews is a more subjective ranking but is also based on research output. I think there will be a stronger positive correlation here because it's more subjective and driven by "prestige". More prestigious universities should have fewer students in their departments because they have less space, but they earn their prestige through research, so they will have relatively high faculty counts.
To test this hypothesis I will do a linear regression to see if ranking and student:faculty ratios are positively correlated. I believe linear is the correct model instead of something polynomial because student:faculty ratio should proportionally change relative to ranking. First I'll check my hypothesis using the CSRankings ranks.
fig = sns.regplot(x=cs_depts["csr_rank"], y=cs_depts["ratio"])
# labeling
for i, txt in enumerate(cs_depts["abbrv"]):
c = "black"
if "maryland" in txt:
c = "red"
fig.annotate(txt, (cs_depts["csr_rank"][i], cs_depts["ratio"][i]), color=c)
fig.set_title("Student:Faculty vs. CSRankings Rank")
fig.set_xlabel("CSRankings Rank")
fig.set_ylabel("Student:Faculty")
import statsmodels.api as sm
y = cs_depts["ratio"].values
x = cs_depts["csr_rank"].values
x = sm.add_constant(x)
mod = sm.OLS(y, x)
res = mod.fit()
res.summary()
Dep. Variable: | y | R-squared: | 0.016 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | -0.019 |
Method: | Least Squares | F-statistic: | 0.4676 |
Date: | Mon, 16 May 2022 | Prob (F-statistic): | 0.500 |
Time: | 18:03:27 | Log-Likelihood: | -127.13 |
No. Observations: | 30 | AIC: | 258.3 |
Df Residuals: | 28 | BIC: | 261.1 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 30.4516 | 6.495 | 4.689 | 0.000 | 17.148 | 43.755 |
x1 | -0.2513 | 0.367 | -0.684 | 0.500 | -1.004 | 0.501 |
Omnibus: | 17.131 | Durbin-Watson: | 1.667 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 20.235 |
Skew: | 1.524 | Prob(JB): | 4.04e-05 |
Kurtosis: | 5.626 | Cond. No. | 36.3 |
So it turns out my hypothesis was completely off. Instead of a small positive correlation between CSRankings rank and student:faculty ratio, there is actually a small negative correlation. This means that, counterintuitively, the higher a department ranks, the higher its student:faculty ratio. This can perhaps be explained by the observation that public universities rank highly for CS, and it has already been shown that student:faculty ratios are higher at public universities than private ones. The high proportion of public universities at the top of the rankings, schools like Cal and Michigan with high student:faculty ratios, and the lower student:faculty ratios at the private universities near the bottom of the top 30 all contribute to this negative correlation.
However, the very small R^2 value suggests that very little of the variation in student:faculty ratios can be explained by CSRankings rank. Also, the p-value of the F-statistic is very high, meaning the model is not statistically significant. Overall, student:faculty ratios and CSRankings rank don't look to be related.
Next, I'll test the hypothesis using the USNews ranks.
fig = sns.regplot(x=cs_depts["usn_rank"], y=cs_depts["ratio"])
# labeling
for i, txt in enumerate(cs_depts["abbrv"]):
c = "black"
if "maryland" in txt:
c = "red"
fig.annotate(txt, (cs_depts["usn_rank"][i], cs_depts["ratio"][i]), color=c)
fig.set_title("Student:Faculty vs. USNews Rank")
fig.set_xlabel("USNews Rank")
fig.set_ylabel("Student:Faculty")
y = cs_depts["ratio"].values
x = cs_depts["usn_rank"].values
x = sm.add_constant(x)
mod = sm.OLS(y, x)
res = mod.fit()
res.summary()
Dep. Variable: | y | R-squared: | 0.008 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | -0.027 |
Method: | Least Squares | F-statistic: | 0.2311 |
Date: | Mon, 16 May 2022 | Prob (F-statistic): | 0.634 |
Time: | 18:03:24 | Log-Likelihood: | -127.25 |
No. Observations: | 30 | AIC: | 258.5 |
Df Residuals: | 28 | BIC: | 261.3 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 28.7118 | 5.467 | 5.252 | 0.000 | 17.514 | 39.910 |
x1 | -0.1194 | 0.248 | -0.481 | 0.634 | -0.628 | 0.389 |
Omnibus: | 16.788 | Durbin-Watson: | 1.628 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 19.423 |
Skew: | 1.514 | Prob(JB): | 6.06e-05 |
Kurtosis: | 5.524 | Cond. No. | 37.9 |
Like with the CSRankings ranks, the result here goes against my hypothesis that student:faculty ratio and USNews ranking would be positively correlated. There is a very small negative correlation between student:faculty ratio and USNews ranking. The small R^2 value, again, means the variation in student:faculty ratio can't really be explained by USNews ranking. The large p-value of the F-statistic (0.634) also tells me this model isn't statistically significant.
After looking at these two models for the two different rankings, school ranking seems to have almost no impact on student:faculty ratio despite the reasonable logic that higher ranking schools would have smaller student to faculty ratios.
Hypothesis: USNews and CSRankings are generally ranking research output, so they should generally be trying to measure the same quantities. Thus, I think there should be a moderately-strong positive correlation between USNews ranking and CSRankings ranking.
I'll plot the data and then do statistical analysis to test my hypothesis. The blue line on the plot will be the linear regression line. The red line on the plot shows where both rankings would be perfectly correlated. If a school is under the red line, it performs better on CSRankings than USNews, and vice versa.
fig = sns.regplot(x=cs_depts["usn_rank"], y=cs_depts["csr_rank"])
# labeling
for i, txt in enumerate(cs_depts["abbrv"]):
c = "black"
if "maryland" in txt:
c = "red"
fig.annotate(txt, (cs_depts["usn_rank"][i], cs_depts["csr_rank"][i]), color=c)
# plot slope of 1 line with intercept at 0, shows which schools perform better on which ranking system
fig.plot([0,40], [0,40], linewidth = 2, color="r")
fig.set_title("USNews Rank vs. CSRankings Rank")
fig.set_xlabel("USNews Rank")
fig.set_ylabel("CSRankings Rank")
From just looking at the regression line plotted, it seems that my hypothesis that there would be a positive correlation between the rankings was correct. I'll do the statistical calculations below to confirm it.
Also, I drew a red line for the hypothetical case where both rankings are perfectly positively correlated. Most schools don't fall on this line, with some performing better on USNews and others on CSRankings. Maryland performs well on the CSRankings rank compared to USNews. Northeastern's rankings are very different, and I wonder what causes USNews to rank them so low when they produce top-15 research output numbers according to CSRankings.
y = cs_depts["csr_rank"].values
x = cs_depts["usn_rank"].values
x = sm.add_constant(x)
mod = sm.OLS(y, x)
res = mod.fit()
res.summary()
Dep. Variable: | y | R-squared: | 0.444 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.424 |
Method: | Least Squares | F-statistic: | 22.33 |
Date: | Mon, 16 May 2022 | Prob (F-statistic): | 5.88e-05 |
Time: | 18:03:24 | Log-Likelihood: | -98.384 |
No. Observations: | 30 | AIC: | 200.8 |
Df Residuals: | 28 | BIC: | 203.6 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 7.4054 | 2.088 | 3.546 | 0.001 | 3.127 | 11.683 |
x1 | 0.4485 | 0.095 | 4.725 | 0.000 | 0.254 | 0.643 |
Omnibus: | 2.173 | Durbin-Watson: | 1.041 |
---|---|---|---|
Prob(Omnibus): | 0.337 | Jarque-Bera (JB): | 0.996 |
Skew: | 0.129 | Prob(JB): | 0.608 |
Kurtosis: | 3.854 | Cond. No. | 37.9 |
The statistical calculations suggest that the linear regression model calculated above is statistically significant because the p-value of the F-statistic is very small. Also, the R^2 implies that R is around 0.666, which could be called a moderately strong positive correlation. Thus, my hypothesis was correct.
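For reference, that R value is recovered by taking the square root of R^2 and attaching the sign of the slope coefficient:

```python
import math

r_squared = 0.444  # R-squared from the OLS summary above
slope = 0.4485     # x1 coefficient; positive, so r takes a positive sign

r = math.copysign(math.sqrt(r_squared), slope)
print(round(r, 3))  # 0.666
```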
One reason there isn't a perfect positive correlation (R=1) is that CSRankings focuses purely on research output with no subjective component, while a lot of what goes into USNews' ranking is peer evaluation, which is probably only a moderate signal of research output with other biases baked in.
Hypothesis: Higher ranking schools should have a larger standardized amount of their alumni in Big Tech. This is because higher ranking should be a proxy for better students, and Big Tech is presumably looking for the best students to hire. This means that there should be a moderately strong negative correlation since higher ranking is lower numerically.
I will be standardizing on size of CS department. This means the Alumni number is the alumni in Big Tech per CS major in that department.
Limitation: Alumni data may include non software engineering alumni.
alumni = alumni.groupby("school", sort=False)
alumni = alumni.agg({"count": "sum"})
# add alumni count to cs_depts dataframe
cs_depts["alumni"] = alumni["count"].values
# standardize alumni count based on esimated majors
cs_depts["alumni"] = cs_depts["alumni"] / cs_depts["estimated_majors"]
fig = sns.regplot(x=cs_depts["usn_rank"], y=cs_depts["alumni"])
# labeling
for i, txt in enumerate(cs_depts["abbrv"]):
c = "black"
if "maryland" in txt:
c = "red"
fig.annotate(txt, (cs_depts["usn_rank"][i], cs_depts["alumni"][i]), color=c)
fig.set_title("USNews Rank vs. Alumni in Big Tech")
fig.set_xlabel("USNews Rank")
fig.set_ylabel("Standardized Alumni")
The plot confirms my hypothesis that there is a negative correlation between school rank and alumni in Big Tech. However, this correlation may not be as strong as I presumed.
Washington has the highest alumni rate, which is interesting: it beats out Stanford, Harvard, CMU, MIT, and Princeton. As noted in my limitations, the data includes people who are not just software engineers with CS degrees. A reasonable guess is that Amazon and Microsoft love to hire people out of Washington into all roles in the company, not just software engineering.
There is also a pretty clear public/private divide, with private universities having much higher alumni numbers than public ones. One reason may be that some of these private universities have well-known MBA programs. Another could be the stronger alumni networks at these private universities. Maryland performs similarly to other public universities. Penn State does surprisingly well, beating out much higher-ranked public universities. Another note is that the public universities with high alumni numbers also tend to be located on the West Coast; it makes sense that Big Tech would like to recruit in their backyard.
y = cs_depts["alumni"].values
x = cs_depts["usn_rank"].values
x = sm.add_constant(x)
mod = sm.OLS(y, x)
res = mod.fit()
res.summary()
Dep. Variable: | y | R-squared: | 0.061 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.027 |
Method: | Least Squares | F-statistic: | 1.820 |
Date: | Mon, 16 May 2022 | Prob (F-statistic): | 0.188 |
Time: | 18:03:24 | Log-Likelihood: | -68.432 |
No. Observations: | 30 | AIC: | 140.9 |
Df Residuals: | 28 | BIC: | 143.7 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 4.6357 | 0.770 | 6.024 | 0.000 | 3.059 | 6.212 |
x1 | -0.0472 | 0.035 | -1.349 | 0.188 | -0.119 | 0.024 |
Omnibus: | 4.799 | Durbin-Watson: | 1.991 |
---|---|---|---|
Prob(Omnibus): | 0.091 | Jarque-Bera (JB): | 2.486 |
Skew: | 0.450 | Prob(JB): | 0.288 |
Kurtosis: | 1.913 | Cond. No. | 37.9 |
The statistical calculations confirm the negative correlation, with R around -0.25, but this is a weak negative correlation rather than the moderately strong one I predicted. The p-value for the F-statistic is also high enough that I wouldn't consider this linear model statistically significant.
I'll now do the same plotting and calculations for the CSRankings rank to see if there are different results.
fig = sns.regplot(x=cs_depts["csr_rank"], y=cs_depts["alumni"])
# labeling
for i, txt in enumerate(cs_depts["abbrv"]):
c = "black"
if "maryland" in txt:
c = "red"
fig.annotate(txt, (cs_depts["csr_rank"][i], cs_depts["alumni"][i]), color=c)
fig.set_title("CSRankings Rank vs. Alumni in Big Tech")
fig.set_xlabel("CSRankings Rank")
fig.set_ylabel("Standardized Alumni")
Wow, so going by raw research-output rankings, there seems to be absolutely no correlation with how many alumni are in Big Tech.
It's interesting that USNews has a noticeably stronger correlation than CSRankings, because both are presumably measuring the strength of graduate schools, which doesn't necessarily depend on Big Tech employment. Though, the stronger the graduate school, the more likely it has good connections with Big Tech.
y = cs_depts["alumni"].values
x = cs_depts["csr_rank"].values
x = sm.add_constant(x)
mod = sm.OLS(y, x)
res = mod.fit()
res.summary()
Dep. Variable: | y | R-squared: | 0.000 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | -0.036 |
Method: | Least Squares | F-statistic: | 0.0003033 |
Date: | Mon, 16 May 2022 | Prob (F-statistic): | 0.986 |
Time: | 18:03:24 | Log-Likelihood: | -69.376 |
No. Observations: | 30 | AIC: | 142.8 |
Df Residuals: | 28 | BIC: | 145.6 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 3.8057 | 0.947 | 4.017 | 0.000 | 1.865 | 5.746 |
x1 | -0.0009 | 0.054 | -0.017 | 0.986 | -0.111 | 0.109 |
Omnibus: | 4.213 | Durbin-Watson: | 2.009 |
---|---|---|---|
Prob(Omnibus): | 0.122 | Jarque-Bera (JB): | 2.986 |
Skew: | 0.616 | Prob(JB): | 0.225 |
Kurtosis: | 2.067 | Cond. No. | 36.3 |
The statistical calculations confirm that there is almost 0 correlation between CSRankings Rank and alumni in Big Tech. The linear regression model is also not statistically significant for these two variables.
Degrees conferred: Some schools lack recent data for degrees conferred, which makes their estimated CS major counts more likely to be undercounts. Also, in a perfect world, CS departments would report accurate numbers of CS majors. The problem is that the numbers can be fudged based on who "counts" as a CS major or when students are allowed to declare CS. These discrepancies between departments would make it an apples-to-oranges comparison.
Rankings: I recently saw that USNews has updated their CS department rankings for 2022 from the 2018 version I used. CSRankings also looks to have updated slightly. Some schools have changed spots, but not drastically. If I have time, I’ll update this report with the new data at some point.
School Alumni: There are far more than 5 companies to work for as a software engineer, so I'm taking an extremely limited view of the job market for CS majors. Also, there could be a non-trivial number of non-CS majors included in this data.
School Cost: I had to make a very rough approximation of the average cost at public universities by assuming that 25% of students pay out-of-state tuition. This number definitely doesn't fit all public universities, with some being much higher and some lower. Perhaps a better approximation would have been to get the overall out-of-state percentage at each public university, which is probably listed somewhere, and assume it's +10% in their CS departments.
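The 25% assumption boils down to a weighted average of the two tuition rates. A sketch with placeholder numbers (the in-state and out-of-state figures here are invented, not taken from the report's data):

```python
# Placeholder tuition figures for one hypothetical public university.
in_state = 11000.0
out_of_state = 38000.0
out_of_state_share = 0.25  # the rough assumption discussed above

# Blended average cost per student under that assumption.
estimated_cost = (1 - out_of_state_share) * in_state + out_of_state_share * out_of_state
print(estimated_cost)  # 17750.0
```

Swapping in a school-specific out-of-state share would only change the weight, not the formula.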
Most schools have a couple mega-lectures, usually introductory courses.
Median lecture size at the 5 schools was under 70.
Public universities have much higher student:faculty ratios than private universities.
No pattern between universities and how much of a “margin” CS departments can bring in per faculty member.
No pattern between school ranking and student:faculty ratio.
USNews and CSRankings are only somewhat correlated, suggesting they measure different things.
Going by USNews ranking, there may be a slight relationship between ranking and how many of their alumni are in Big Tech.
Going by CSRankings ranking, there is no relationship between ranking and how many of their alumni are in Big Tech.
There are several more schools which have publicly accessible class registry websites that I could scrape for lecture size data. If I have time, I would like to add to the 5 schools that I currently have.
It would be cool if CS departments got together in some kind of agreement to become very transparent on all this data so that prospective undergraduates could make the best decision. This would motivate CS departments to do better for their undergraduates too because everything would be out in the open. Perhaps there could be a rankings website that is just dedicated to undergraduate CS rankings (I know a USNews version exists, but I think their methodology is flawed for undergraduate rankings).
Again, please send comments and corrections to benc@umd.edu