├── source/
│ ├── data/
Note: The code examples below are simplified demonstrations for illustrative purposes only. For the complete, production-ready implementation, please refer to the Codebase Reference section.
# Import required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
def scrape_phd_data(url, timeout=30):
    """Scrape PhD candidate data from a university web page.

    Downloads ``url``, parses it with BeautifulSoup's ``html.parser`` and
    reads one candidate from every element matching ``.faculty-listing``.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            server.

    Returns:
        A ``pandas.DataFrame`` with the columns ``name`` and
        ``department``, one row per listing that contains an ``<h3>``
        name. Both columns are present even when no candidate is found.
        ``department`` is ``None`` for listings without a ``div.dept``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of silently parsing them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Example extraction pattern
    candidates = []
    for profile in soup.select('.faculty-listing'):
        name_tag = profile.find('h3')
        if name_tag is None:
            # A listing without a name cannot be identified; skip it
            # rather than crash on ``None.text``.
            continue
        dept_tag = profile.find('div', class_='dept')
        # Strip the department the same way as the name so stray markup
        # whitespace does not leak into the data.
        department = dept_tag.text.strip() if dept_tag is not None else None
        candidates.append({
            'name': name_tag.text.strip(),
            'department': department,
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(candidates, columns=['name', 'department'])
# Gender detection function
# Shared detector instance, created lazily by _get_gender_detector().
_GENDER_DETECTOR = None


def _get_gender_detector():
    """Return the shared ``Detector``, creating it on first use."""
    global _GENDER_DETECTOR
    if _GENDER_DETECTOR is None:
        # Imported lazily so this module can be loaded (and the guard
        # paths of detect_gender used) without gender_guesser installed.
        from gender_guesser.detector import Detector
        _GENDER_DETECTOR = Detector()
    return _GENDER_DETECTOR


def detect_gender(name):
    """Guess a gender label from the first token of a full name.

    Args:
        name: Full name; only the first whitespace-separated token is
            passed to ``gender_guesser``.

    Returns:
        The label returned by ``Detector.get_gender`` for the first name,
        or ``'unknown'`` when ``name`` is not a string (e.g. ``None`` or
        a NaN coming from pandas) or contains no non-whitespace text.
    """
    if not isinstance(name, str):
        return 'unknown'
    tokens = name.split()
    if not tokens:
        # Empty or whitespace-only names have no first name to look up.
        return 'unknown'
    # Reuse one detector: building a new one per call is wasteful when
    # this function is applied to every row of a DataFrame.
    return _get_gender_detector().get_gender(tokens[0])
# Import and merge university datasets
def merge_university_data(data_dir='data'):
    """Load the per-university candidate CSVs and merge them into one table.

    Reads ``harvard_candidates.csv``, ``stanford_candidates.csv`` and
    ``yale_candidates.csv`` from ``data_dir``, lower-cases their column
    names, tags each row with its university, concatenates the frames and
    adds a ``gender`` column computed by ``detect_gender`` from ``name``.

    Args:
        data_dir: Directory containing the CSV files. Defaults to
            ``'data'``, the location the files were originally read from.

    Returns:
        A ``pandas.DataFrame`` with a fresh integer index, lower-case
        column names, and the added ``university`` and ``gender`` columns.

    Raises:
        FileNotFoundError: If one of the CSV files is missing.
        KeyError: If the merged data has no ``name`` column.
    """
    from pathlib import Path

    # CSV file name -> value written to the ``university`` column.
    sources = {
        'harvard_candidates.csv': 'Harvard University',
        'stanford_candidates.csv': 'Stanford University',
        'yale_candidates.csv': 'Yale University',
    }

    frames = []
    for filename, university in sources.items():
        frame = pd.read_csv(Path(data_dir) / filename)
        # Standardize column names BEFORE concatenating: otherwise
        # differently cased headers ("Name" vs "name") end up as separate
        # columns that collide once they are lower-cased afterwards.
        frame.columns = frame.columns.str.lower()
        frame['university'] = university
        frames.append(frame)

    # Merge all dataframes
    merged_df = pd.concat(frames, ignore_index=True)

    if 'name' not in merged_df.columns:
        raise KeyError("merged data has no 'name' column; cannot classify gender")

    # Add gender classification
    merged_df['gender'] = merged_df['name'].apply(detect_gender)
    return merged_df
| Institution | File Type | Code Artifact | Description |
|---|---|---|---|
| University of Pennsylvania | 📒 Notebook | process_old.ipynb | Notebook containing steps to scrape PhD placement data |
| UC Berkeley | 📒 Notebook | process_old.ipynb | Notebook containing steps to scrape PhD placement data |
| Yale | 📒 Notebook | process_old.ipynb | Notebook containing steps to scrape PhD placement data |
| Columbia University | 📒 Notebook | process_old.ipynb | Notebook containing steps to scrape PhD placement data |
| UCSD | 📒 Notebook | new_steps.ipynb | Notebook containing steps to scrape PhD placement data |
| UC Riverside | 📒 Notebook | new_steps.ipynb | Notebook containing steps to scrape PhD placement data |
| UC Davis | 📒 Notebook | new_steps.ipynb | Notebook containing steps to scrape PhD placement data |
| Stanford | 📒 Notebook | new_steps.ipynb | Notebook containing steps to scrape PhD placement data |
| Harvard | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| NYU | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| Duke | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| UMich | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| Brown | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| JHU | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| UT Austin | 📒 Notebook | scrape_missing.ipynb | Notebook containing steps to scrape PhD placement data |
| All Universities | 📒 Notebook | dataset_combine.ipynb | Combines all university datasets into final dataset |
| Analysis | 📒 Notebook | final_analysis.ipynb | Contains all our data analysis and visualization code |
beautifulsoup4==4.12.2
pandas==2.0.3
requests==2.31.0
selenium==4.10.0
gender-guesser==0.4.0
jupyterlab==4.0.4