Source code for nys_parole_scraper.scraper_functions

# -*- coding: utf-8 -*-
"""
@author: khayes
"""
from io import StringIO

import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

#=============================================================================
# FUNCTION: SCRAPE TABLE 1
#=============================================================================

def scrape_table1(driver, df):
    """Scrape the "ParoleeInformation" table into a one-record DataFrame.

    The element with id ``MainContent_paroleeInformation`` is read from the
    current page, parsed with pandas and transposed. The first row of the
    transposed table becomes the column labels and the remaining rows are
    returned.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver whose current page contains the parolee information table.
    df : any
        Not used; accepted only so existing callers keep working. The
        returned DataFrame is built from scratch.

    Returns
    -------
    pandas DataFrame
        The transposed table with its first row promoted to the header.
    """
    html = driver.find_element(
        By.XPATH, "//*[@id='MainContent_paroleeInformation']"
    ).get_attribute('outerHTML')

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated from pandas 2.1 onwards.
    info = pd.read_html(StringIO(html))[0].transpose()

    # Promote the first transposed row to the header, then drop it.
    info.columns = info.iloc[0]
    return info[1:]
#============================================================================= # FUNCTION: SCRAPE TABLE 2 #=============================================================================
def scrape_table2(driver, df):
    """Scrape the supervision information table and attach it to ``df``.

    The element with id ``MainContent_supervisionInformation`` is parsed,
    transposed, and its first row promoted to the header (the same
    treatment as the parolee information table). The result is
    cross-merged onto ``df``.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver whose current page contains the supervision table.
    df : pandas DataFrame
        Record scraped so far; it is not modified in place.

    Returns
    -------
    pandas DataFrame
        ``df`` cross-merged with the supervision columns.

    Notes
    -----
    ``find_element`` is not guarded here, so the caller is responsible for
    only calling this function when the table exists on the page.
    """
    html = driver.find_element(
        By.XPATH, "//*[@id='MainContent_supervisionInformation']"
    ).get_attribute('outerHTML')

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated from pandas 2.1 onwards.
    supervision = pd.read_html(StringIO(html))[0].transpose()

    # Promote the first transposed row to the header, then drop it.
    supervision.columns = supervision.iloc[0]
    supervision = supervision[1:]

    return pd.merge(df, supervision, how="cross")
#============================================================================= # FUNCTION: SCRAPE TABLE 3 #=============================================================================
def scrape_table3(driver, df, uid):
    """Scrape the offense table, flatten it onto ``df`` and add an ID column.

    The element with id ``MainContent_offenseInformationTable`` is parsed
    and transposed so that each offense becomes a column. The first three
    rows of the transposed table are then flattened into numbered columns
    (``'Crime of conviction 1'``, ``'Class 1'``, ``'County 1'``, ...) and
    cross-merged onto ``df``. Every record gets at least ten slots per
    field so that records with different offense counts share a schema;
    unused slots are NaN.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver whose current page contains the offense table.
    df : pandas DataFrame
        Record scraped so far; it is not modified in place.
    uid : any
        Identifier stored in a new leading ``'ID'`` column.

    Returns
    -------
    pandas DataFrame
        ``df`` extended with the offense columns and the ``'ID'`` column.

    Notes
    -----
    ``find_element`` is not guarded here, so the caller is responsible for
    only calling this function when the table exists on the page.
    """
    html = driver.find_element(
        By.XPATH, "//*[@id='MainContent_offenseInformationTable']"
    ).get_attribute('outerHTML')

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated from pandas 2.1 onwards.
    offenses = pd.read_html(StringIO(html))[0].transpose()
    # After this the columns are the offense positions 0..n-1.
    offenses = offenses.reset_index().drop(columns=['index'])

    min_slots = 10
    # Row order of the transposed table: charges, classes, counties.
    for position, label in enumerate(('Crime of conviction', 'Class', 'County')):
        row = offenses.iloc[[position]]
        # Pad to at least ``min_slots`` columns. reindex replaces the
        # DataFrame.append call that was removed in pandas 2.0.
        width = max(min_slots, row.shape[1])
        row = row.reindex(columns=range(width))
        # Label every slot, including any beyond the tenth, so integer
        # column names never leak into (and collide across) the merges.
        row.columns = [f'{label} {slot + 1}' for slot in range(width)]
        df = pd.merge(df, row, how="cross")

    df.insert(loc=0, column='ID', value=uid)
    return df
#========================================================== # DEFINE NEW SEARCH #========================================================== #========================================================== # DEFINE FREQUENCY TABLE FUNCTION #==========================================================
def freq_table(df, column, col_name):
    """
    Create a frequency table for one column of a DataFrame.

    Missing values are counted as their own category.

    Parameters
    ----------
    df : pandas DataFrame
        The DataFrame holding the data to tabulate.
    column : String
        The column in the DataFrame to make a frequency table of.
    col_name : String
        The name to give to the first column of the returned DataFrame.

    Returns
    -------
    pandas DataFrame
        One row per distinct value of ``column`` with three columns:
        ``col_name`` (the value), ``'Count'`` (number of occurrences) and
        ``'%'`` (share of all rows, rounded to one decimal and formatted
        as a string such as ``'40.0%'``).

    Raises
    ------
    ValueError
        If ``df`` is not a pandas DataFrame, or if ``column`` or
        ``col_name`` is not a string.

    Example
    --------
    >>> import pandas as pd
    >>> from nys_parole_scraper import scraper_functions as sf
    >>> data = {'animal': ['Cat', 'Dog', 'Cat', 'Dog', 'Bird'],
    ...         'color': ['Grey', 'Brown', 'Black', 'Brown', 'White'],
    ...         'weight': [1, 2, 3.4, 4, 1]}
    >>> df = pd.DataFrame(data)
    >>> animal_frequency = sf.freq_table(df, 'animal', 'Animals')
    >>> animal_frequency

    Animals |  Count  |  %
    ------------------------
    Cat     |    2    | 40.0%
    Dog     |    2    | 40.0%
    Bird    |    1    | 20.0%
    """
    if not isinstance(df, pd.DataFrame):
        # The message now names the argument that actually failed.
        raise ValueError('df argument must be a pandas DataFrame')
    if not isinstance(column, str):
        raise ValueError('column argument must be a string')
    if not isinstance(col_name, str):
        raise ValueError('col_name argument must be a string')

    values = df[column]
    counts = values.value_counts(dropna=False)
    shares = values.value_counts(dropna=False, normalize=True)
    # Format as text, e.g. 0.4 -> '40.0%'.
    percents = shares.mul(100).round(1).astype(str) + '%'

    # Both series share one index (the distinct values), so concat aligns
    # them row by row.
    table = pd.concat([counts, percents], axis=1, keys=['Count', '%'])
    return table.rename_axis(col_name).reset_index()