# -*- coding: utf-8 -*-
"""
@author: khayes
"""
# import libraries
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
import numpy as np
import os
from datetime import date
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from nys_parole_scraper import scraper_functions as sf
# changed from just " import scraper_functions as sf "
### making this change, and moving all files from src>nys_parole_scraper to just src, and then importing scraper as "from nys_parole_scraper.nys_parole_scraper import parole_scraper" (alternatively could do "from nys_scraper import nys_scraper" and then use nys_scraper.parole_scraper as function call). This change allows both this to work as well as pytest
from selenium.webdriver.firefox.options import Options
[docs]def parole_scraper(file_path, directory):
"""
Scrapes NYS parole information from the NYS DOCCS parolee lookup website
based on user inputs and returns a clean dataset and summary statistics.
Parameters
----------
file_path : String
A string representing the path of the CSV file that includes the identifying
information of the individuals to be searched for with the scraper. See
Readme for more information of the input csv file construction.
directory : String
A string representing the folder path where the user would like an output
folder created, in which the final dataset and summary statistics will
be exported.
Returns
-------
To return the following python objects as DataFrames, assign the function to
two variables (ex: df1, df2 = parole_scraper(file_path, directory)). To only export
the dataframes to the provided directory, you do not need to assign the
function to variables (ex: parole_scraper(file_path, directory))
full_output : pandas DataFrame, CSV
The concatenated parole information of all provided individuals found
in the DOCCS parole database. The returned dataframe will take on
whatever name you assign first to the the function.
Example: df1, df2 = parole_scraper(file_path, directory)
full_output will be returned as df1
full_putput will also be exported as a csv to an output folder in the
directory provided by the directory parameter.
stats_list: List, Excel
A list of the Following DataFrames, returned as the second varible assigned
to the function call. All summary statistics and frequency table objects
will be exported in separate sheets of an Excel file in the same output
folder.
stats_numeric : pandas DataFrame
Summary statistics on age; length of time between release to
parole and current date in months; and number of convictions
race_freq : pandas DataFrame
Frequency table of race/ethncity
age_freq : pandas DataFrame
Frequency table of age groups
pstatus_freq : pandas DataFrame
Frequency table of current parole status
county_unique_freq : pandas DataFrame
Frequency table of unique individuals with convictions in each county
top_charge_freq : pandas DataFrame
Frequency table of top charges
convictions_freq : pandas DataFrame
Frequency table of unique individuals per charge type (every
conviction, not only top charge)
Example
--------
Returning DataFrames as objects:
>>> from parole_scrpaer_MDS import parole_scraper
>>> file_path = "C:/Users/parole_scraping.csv"
>>> directory = "C:/Users/Output_Folder"
>>> df1, df_list = parole_scraper(file_path, directory)
df1 = full_output
df_list = stats_list
full_output exported as CSV
stats_list exported as Excel
Without returning DataFrames as objects:
>>> from parole_scrpaer_MDS import parole_scraper
>>> file_path = "C:/Users/parole_scraping.csv"
>>> directory = "C:/Users/Output_Folder"
>>> parole_scraper(file_path, directory)
full_output exported as CSV
stats_list exported as Excel
"""
if isinstance(file_path, int):
raise ValueError('File_path argument must be a string')
if isinstance(file_path, float):
raise ValueError('File_path argument must be a string')
if isinstance(directory, int):
raise ValueError('directory argument must be a string')
if isinstance(directory, float):
raise ValueError('directory argument must be a string')
assert os.path.isfile(file_path) != False, "file_path argument must be a path to an existing file. Please check your input."
assert os.path.isdir(directory) != False, "directory argument must be the path to an existing directory. Please check your input."
#create date and time variable and make directory name
now = datetime.now()
dyn_dir_name = now.strftime("%Y%m%d_%H.%M.%S")
# create today variables for adding to dataset and computing date differences
now_today = date.today()
today_str = now_today.strftime("%m/%d/%Y")
today = datetime.strptime(today_str, "%m/%d/%Y")
#create directory
global output_dir
output_dir = directory+"/Output_"+dyn_dir_name
if not os.path.exists(output_dir):
os.makedirs(output_dir)
#Import file with scraping info as csv
data_csv = pd.read_csv(file_path, delimiter=',', dtype='str')
#strip white spaces
for col in ('NYSID',
'First Name',
'Last Name',
'DOB',
'Unique ID'):
data_csv[col] = data_csv[col].astype(str).str.strip()
#rename columns
data_csv = data_csv.rename(columns={'First Name': 'first', 'Last Name': 'last', 'DOB': 'dob', 'Unique ID': 'id'})
# create year column
data_csv['year'] = data_csv['dob'].str.extract(r'(\d\d\d\d)$')
# create df of nysids and id only
nysids_start = data_csv.drop(columns=['first', 'last', 'dob'])
# keep only NYSIDS with 9 didgits followed by letter
nysids = nysids_start[nysids_start['NYSID'].str.contains(r'^\d\d\d\d\d\d\d\d[A-Za-z]$')]
# keep only NYSIDS that do not have "00000" - these are fake nysids given by state when NYSID is unknown
nysids = nysids[nysids['NYSID'].str.contains(r'^(?!00000)')]
# outer merge with data_csv
outer_join = data_csv.merge(nysids, how = 'outer', indicator = True)
# anti-join to get names that need to be scraped
names = outer_join[~(outer_join._merge == 'both')].drop('_merge', axis = 1)
# specify the url (DOCCS)
urlpage = 'https://publicapps.doccs.ny.gov/ParoleeLookup/default'
# run firefox webdriver from executable path
global driver
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
# get web page
driver.get(urlpage)
# sleep for 5s to allow page to fully load
time.sleep(5)
# create emtpy df and list to append individuals' scraped data to as dataframes
global dataframe
dataframe = pd.DataFrame()
global df_list
df_list = []
# global wait
global wait
wait = WebDriverWait(driver, 30)
#=========================================================================
#LOOP BEGINS HERE
#=========================================================================
#Wait until page is fully loaded and nysid field is available
for x,e in zip(nysids['NYSID'], nysids['id']):
wait.until(EC.visibility_of_element_located((
By.XPATH, "//*[@id='MainContent_txtNysid']")))
#find nysid entry by xpath
nysid_search = driver.find_element(
By.XPATH, "//*[@id='MainContent_txtNysid']")
# find submit button by xpath_homepage
submit = driver.find_element(By.XPATH,
"//*[@id='MainContent_BtnSubmit']")
# Enter nysid in first searchbar
nysid_search.send_keys(x)
# Click Submit
submit.click()
#Check if nysid was not found
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_paroleeInformation']")
except NoSuchElementException:
wait.until(EC.visibility_of_element_located((
By.XPATH, "//*[@id='MainContent_NewSearch']")))
newsearch = driver.find_element(By.XPATH, "//*[@id='MainContent_NewSearch']")
newsearch.click()
continue
#==========================================================================
## SCRAPE DATA:
#==========================================================================
##Table 1 - "ParoleeInformation"
dataframe = sf.scrape_table1(driver, dataframe)
##Table 2 - "SupervisionInformation"
#######Check that Table 2 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_supervisionInformation']/tbody/tr[1]/td[1]")
except NoSuchElementException:
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException: # if neither supervision nor offense tables exist
#export_nysid_df(dataframe)
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#if offense table exists:
dataframe = sf.scrape_table3(driver, dataframe, e)
#export to csv:
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
##If Table 2 (supervision info) exists, scrape:
dataframe = sf.scrape_table2(driver, dataframe)
##Table 3 - "OffenseInformation"
#######Check that Table 3 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException:
#export
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#If Table 3 exists, scrape:
dataframe = sf.scrape_table3(driver, dataframe, e)
#####Done Scraping
df_list.append(dataframe)
sf.new_search(driver, wait)
#==============================================================================
# LOOP THROUGH NAMES
#==============================================================================
for a,b,c,d,e in zip(names['first'], names['last'], names['dob'], names['year'], names['id']):
wait.until(EC.visibility_of_element_located((
By.XPATH, "//*[@id='MainContent_txtFName']")))
#find first name entry by xpath
first_search = driver.find_element(
By.XPATH, "//*[@id='MainContent_txtFName']")
#find last name entry by xpath
last_search = driver.find_element(
By.XPATH, "//*[@id='MainContent_txtLName']")
#find last name entry by xpath
year_search = driver.find_element(By.XPATH, "//*[@id='MainContent_txtYob']")
# find submit button by xpath_homepage
submit = driver.find_element(By.XPATH,
"//*[@id='MainContent_BtnSubmit']")
# Enter first name in first searchbar
first_search.send_keys(a)
# Enter last name in first searchbar
last_search.send_keys(b)
# Enter year in year searchbar
year_search.send_keys(d)
# Click Submit
submit.click()
#Check if name was not found
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_manyResultsDiv']")
except NoSuchElementException:
sf.new_search(driver, wait)
continue
try: # check if there is a DOB match on page 1 of results
check = driver.find_element(By.XPATH, "//tr[td='" + c + "']")
except NoSuchElementException:
try: # if no DOB match, check if page 2 exists
wait.until(EC.visibility_of_element_located((
By.XPATH, "//*[@id='MainContent_navRowTable']")))
check = driver.find_element(By.LINK_TEXT, '2')
except NoSuchElementException: # if page 2 does not exist, restart loop
sf.new_search(driver, wait)
continue
# if page 2 exists, go to page 2
page = driver.find_element(By.LINK_TEXT, '2').click()
# check if there are DOB matches on page two
try:
check = driver.find_element(By.XPATH, "//tr[td='" + c + "']")
except NoSuchElementException:
try: # if no DOB match, check if page 3 exists
wait.until(EC.visibility_of_element_located((
By.XPATH, "//*[@id='MainContent_navRowTable']")))
check = driver.find_element(By.LINK_TEXT, '3')
except NoSuchElementException: # if page 3 does not exist, restart loop
sf.new_search(driver, wait)
continue
# if page 3 exists, go to page 3
page = driver.find_element(By.LINK_TEXT, '3').click()
# check if there are DOB matches on page 3
try:
check = driver.find_element(By.XPATH, "//tr[td='" + c + "']")
except NoSuchElementException:
try: # if no DOB match, check if page 4 exists
wait.until(EC.visibility_of_element_located((
By.XPATH, "//*[@id='MainContent_navRowTable']")))
check = driver.find_element(By.LINK_TEXT, '4')
except NoSuchElementException: # if page 4 does not exist, restart loop
sf.new_search(driver, wait)
continue
# if page 4 exists, go to page 4
page = driver.find_element(By.LINK_TEXT, '4').click()
try:
check = driver.find_element(By.XPATH, "//tr[td='" + c + "']")
except NoSuchElementException: # if DOB match not found continue
sf.new_search(driver, wait)
continue
#=============================================================
#SCRAPE RESULT FROM PAGE 4
#=============================================================
#click on name
link = driver.find_element(By.XPATH, "//tr[td='" + c + "']/td/preceding-sibling::td[3]")
link.click()
##Table 1 - "ParoleeInformation"
dataframe = sf.scrape_table1(driver, dataframe)
##Table 2 - "SupervisionInformation"
#######Check that Table 2 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_supervisionInformation']/tbody/tr[1]/td[1]")
except NoSuchElementException:
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException: # if neither supervision nor offense tables exist
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#if offense table exists:
dataframe = sf.scrape_table3(driver, dataframe, e)
#export to csv:
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
##If Table 2 (supervision info) exists, scrape:
dataframe = sf.scrape_table2(driver, dataframe)
##Table 3 - "OffenseInformation"
#######Check that Table 3 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException:
#export
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#If Table 3 exists, scrape:
dataframe = sf.scrape_table3(driver, dataframe, e)
#####Done Scraping
#export
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#=============================================================
#SCRAPE RESULT FROM PAGE 3
#=============================================================
#click on name
link = driver.find_element(By.XPATH, "//tr[td='" + c + "']/td/preceding-sibling::td[3]")
link.click()
##Table 1 - "ParoleeInformation"
dataframe = sf.scrape_table1(driver, dataframe)
##Table 2 - "SupervisionInformation"
#######Check that Table 2 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_supervisionInformation']/tbody/tr[1]/td[1]")
except NoSuchElementException:
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException: # if neither supervision nor offense tables exist
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#if offense table exists:
dataframe = sf.scrape_table3(driver, dataframe, e)
#export to csv:
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
##If Table 2 (supervision info) exists, scrape:
dataframe = sf.scrape_table2(driver, dataframe)
##Table 3 - "OffenseInformation"
#######Check that Table 3 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException:
#export
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#If Table 3 exists, scrape:
dataframe = sf.scrape_table3(driver, dataframe, e)
#####Done Scraping
#export
#export_name_df()
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#=============================================================
#SCRAPE RESULT FROM PAGE 2
#=============================================================
#click on name
link = driver.find_element(By.XPATH, "//tr[td='" + c + "']/td/preceding-sibling::td[3]")
link.click()
##Table 1 - "ParoleeInformation"
dataframe = sf.scrape_table1(driver, dataframe)
##Table 2 - "SupervisionInformation"
#######Check that Table 2 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_supervisionInformation']/tbody/tr[1]/td[1]")
except NoSuchElementException:
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException: # if neither supervision nor offense tables exist
#export_name_df()
df_list.append(dataframe)
# #Go back to home page
sf.new_search(driver, wait)
continue
#if offense table exists:
dataframe = sf.scrape_table3(driver, dataframe, e)
#export to csv:
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
##If Table 2 (supervision info) exists, scrape:
dataframe = sf.scrape_table2(driver, dataframe)
##Table 3 - "OffenseInformation"
#######Check that Table 3 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException:
#export
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#If Table 3 exists, scrape:
dataframe = sf.scrape_table3(driver, dataframe, e)
#####Done Scraping
#export
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#=============================================================
#SCRAPE RESULT FROM PAGE 1
#=============================================================
#click on name
link = driver.find_element(By.XPATH, "//tr[td='" + c + "']/td/preceding-sibling::td[3]")
link.click()
##Table 1 - "ParoleeInformation"
dataframe = sf.scrape_table1(driver, dataframe)
##Table 2 - "SupervisionInformation"
#######Check that Table 2 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_supervisionInformation']/tbody/tr[1]/td[1]")
except NoSuchElementException:
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException: # if neither supervision nor offense tables exist
#export_name_df()
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
#if offense table exists:
dataframe = sf.scrape_table3(driver, dataframe, e)
#export to csv:
df_list.append(dataframe)
#Go back to home page
sf.new_search(driver, wait)
continue
##If Table 2 (supervision info) exists, scrape:
dataframe = sf.scrape_table2(driver, dataframe)
##Table 3 - "OffenseInformation"
#######Check that Table 3 exists
try:
check = driver.find_element(By.XPATH,
"//*[@id='MainContent_offenseInformationTable']/tbody/tr[2]/td[1]")
except NoSuchElementException:
#export
df_list.append(dataframe)
# find new search button by xpath
sf.new_search(driver, wait)
continue
#If Table 3 exists, scrape:
dataframe = sf.scrape_table3(driver, dataframe, e)
#####Done Scraping
#export
df_list.append(dataframe)
# find new search button by xpath
sf.new_search(driver, wait)
#=============================================================================
# END OF LOOPS - CONCAT AND EXPORT AS CSV
#=============================================================================
#end loop; close driver
driver.close()
driver.quit()
# concatenate all nysid dfs
global full_output
full_output = pd.concat(df_list)
#Capitalize certain columns
for col in ('Name:',
'Parole status:',
'Class 1',
'Class 2',
'Class 3',
'Class 4',
'Class 5',
'Class 6',
'Class 7',
'Class 8',
'Class 9',
'Class 10',
'County 1',
'County 2',
'County 3',
'County 4',
'County 5',
'County 6',
'County 7',
'County 8',
'County 9',
'County 10'):
full_output[col] = full_output[col].fillna("")
full_output[col] = full_output[col].astype(str).str.title()
# Add column indicating day data was scraped
full_output["Date Info Scraped:"] = today #today_str !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# #Turn dates into datetime objects
# full_output[["Date of birth:", "Release to parole supervision:"]]= full_output[["Date of birth:", "Release to parole supervision:"]].astype('datetime64[ns]')
full_output[["Date of birth:", "Release to parole supervision:", "Effective date:"]]= full_output[["Date of birth:", "Release to parole supervision:","Effective date:"]].astype('datetime64[ns]')
# Add age column
idob = full_output.columns.get_loc("Date of birth:") # get index location of DOB
age = (today - full_output["Date of birth:"]) / np.timedelta64(1, 'Y') # get series of ages
age = np.floor(age) # get actual age, no decimals
full_output.insert(loc = idob +1,
column = 'Age:',
value = age) #insert new age column directly after DOB column
# Add months since release column
irel = full_output.columns.get_loc("Release to parole supervision:") # get index location of release date
mrelease = (today - full_output["Release to parole supervision:"]) / np.timedelta64(1, 'M') # get series
mrelease = np.around(mrelease, decimals=1, out=None) # round to 1 decimal
full_output.insert(loc = irel +1,
column = "Months Since Release:",
value = mrelease) #insert new column directly after release date
iconv = full_output.columns.get_loc("Crime of conviction 10")
count_conv = full_output[['Crime of conviction 1',
'Crime of conviction 2',
'Crime of conviction 3',
'Crime of conviction 4',
'Crime of conviction 5',
'Crime of conviction 6',
'Crime of conviction 7',
'Crime of conviction 8',
'Crime of conviction 9',
'Crime of conviction 10']].apply(lambda x: x.notnull().sum(), axis='columns')
full_output.insert(loc = iconv +1,
column = "Total Convictions",
value = count_conv) # insert total after last conviction column
# make null IDs empty cells
full_output['ID'] = np.where(
full_output['ID'] == "nan", '', full_output['ID'])
# make new dataframe for summary stats
output_stats = full_output.copy()
# change empoty spaces back to nans
full_output = full_output.replace(r'^\s*$', np.nan, regex=True)
#export to csv
full_output.to_csv(output_dir + '/parole_full_output_'+ dyn_dir_name +'.csv', index=False, encoding="utf-8")
#=============================================================================
# SUMMARY STATISTICS
#=============================================================================
# summary stats on continuous variables
global stats_numeric
stats_numeric = full_output[["Age:", "Months Since Release:","Total Convictions"]].describe().loc[['count','min','mean','max']]
# create age bins
bins = [0,20,30,40,50,60,150]
group_names=['0 - 20','21-30','31-40','41-50','51-60','60 +']
output_stats['age_bin']=pd.cut(output_stats['Age:'],bins,labels=group_names).astype(object)
global age_freq
age_freq = sf.freq_table(output_stats, "age_bin", "Age")
# get frequency of unique individuals with cases in different counties
county_list = ['County 1',
'County 2',
'County 3',
'County 4',
'County 5',
'County 6',
'County 7',
'County 8',
'County 9',
'County 10']
output_stats[county_list] = output_stats[county_list].fillna("")
counties = np.unique(output_stats[county_list].values)
counties = [item for item in counties if item != ""]
global county_unique_freq
county_unique_freq = pd.DataFrame(columns=['Count', '%'], index = counties)
county_unique_freq.index.rename("Counties All Convictions", inplace = True)
for county in counties:
output_stats[county+'_unique'] = output_stats[['County 1',
'County 2',
'County 3',
'County 4',
'County 5',
'County 6',
'County 7',
'County 8',
'County 9',
'County 10']].apply(
lambda x: x.str.contains(county ,case=False)).any(axis=1).astype(int)
c = output_stats[county+'_unique'].sum()
p = str(round((c/len(output_stats.index))*100, 1))+ '%'
county_unique_freq.loc[county].Count = c
county_unique_freq.loc[county]['%'] = p
county_unique_freq.reset_index(inplace = True)
# frequency tables for race/ethnicity
global race_freq
race_freq = sf.freq_table(output_stats, "Race / ethnicity:", "Race/Ethnicity")
# frequency tables for parole status
global pstatus_freq
pstatus_freq = sf.freq_table(output_stats, "Parole status:", "Parole Status")
# get counts of charges
conv_list = ['Crime of conviction 1',
'Crime of conviction 2',
'Crime of conviction 3',
'Crime of conviction 4',
'Crime of conviction 5',
'Crime of conviction 6',
'Crime of conviction 7',
'Crime of conviction 8',
'Crime of conviction 9',
'Crime of conviction 10']
output_stats[conv_list] = output_stats[conv_list].fillna("")
convictions = np.unique(output_stats[conv_list].values)
convictions = [item for item in convictions if item != ""]
global convictions_freq
convictions_freq = pd.DataFrame(columns=['Count', '%'], index = convictions)
convictions_freq.index.rename("Charges All Convictions", inplace = True)
for conv in convictions:
output_stats[conv+'_unique'] = output_stats[['Crime of conviction 1',
'Crime of conviction 2',
'Crime of conviction 3',
'Crime of conviction 4',
'Crime of conviction 5',
'Crime of conviction 6',
'Crime of conviction 7',
'Crime of conviction 8',
'Crime of conviction 9',
'Crime of conviction 10']].apply(
lambda x: x.str.contains(conv ,case=False)).any(axis=1).astype(int)
c = output_stats[conv+'_unique'].sum()
p = str(round((c/len(output_stats.index))*100, 1))+ '%'
convictions_freq.loc[conv].Count = c
convictions_freq.loc[conv]['%'] = p
convictions_freq.reset_index(inplace = True)
#get top charge (crime of conviction #1) fequency table
global top_charge_freq
top_charge_freq = sf.freq_table(output_stats, 'Crime of conviction 1', "Top Charge")
##### Export --------------------------------------------------------------
# Create a Pandas Excel writer
writer = pd.ExcelWriter(output_dir + '/summary_statistics_'+ dyn_dir_name + '.xlsx', engine='xlsxwriter')
# Write each dataframe to a different worksheet.
stats_numeric.to_excel(writer, sheet_name='Summary Statistics', index = False)
age_freq.to_excel(writer, sheet_name='Age', index = False)
race_freq.to_excel(writer, sheet_name='Race.Ethnicity', index = False)
pstatus_freq.to_excel(writer, sheet_name='Parole Status', index = False)
county_unique_freq.to_excel(writer, sheet_name='Unique People per County',
index = False)
top_charge_freq.to_excel(writer, sheet_name='Top Charge', index = False)
convictions_freq.to_excel(writer, sheet_name='Unique People per Charge Type',
index = False)
# Close the Pandas Excel writer and output the Excel file.
writer.save()
#create list with stats dataframes
stats_list = [stats_numeric,
race_freq,
age_freq,
pstatus_freq,
county_unique_freq,
top_charge_freq,
convictions_freq]
return full_output, stats_list