IMPORTS & DATA CLEANING
import pandas as pd
import numpy as np
import requests
import folium
from folium.plugins import HeatMap
import matplotlib.pyplot as plt
API REQUEST
response = requests.get("https://data.cityofnewyork.us/resource/uip8-fykc.json?$limit=60000")
url = 'https://data.cityofnewyork.us/resource/uip8-fykc.json'
params = {
'$select': 'arrest_date,arrest_boro,arrest_precinct,age_group,perp_sex,perp_race,ofns_desc,latitude,longitude',
'$limit':60000
}
response = requests.get(url, params=params)
data = response.json()
df1 = pd.json_normalize(data)
df1.shape
(54576, 9)
# Set the date range for the data you want to retrieve
start_date = '2018-01-01T00:00:00.000'
end_date = '2023-05-04T23:59:59.999'
# Build the API request with the date filter
url = 'https://data.cityofnewyork.us/resource/8h9b-rp9u.json'
params = {
'$select': 'arrest_date,arrest_boro,arrest_precinct,age_group,perp_sex,perp_race,ofns_desc,latitude,longitude',
'$where': f"arrest_date between '{start_date}' and '{end_date}'",
'$limit':5000000
}
response = requests.get(url, params=params)
data = response.json()
# Convert the data to a pandas dataframe
df2 = pd.json_normalize(data)
df2.shape
(947084, 9)
#print(response.content)
#df1=pd.json_normalize(response.json())
#df1 = pd.DataFrame(response.json()[1:], columns=response.json()[0])
#print(df1.head())
df1
arrest_date | arrest_boro | arrest_precinct | age_group | perp_sex | perp_race | ofns_desc | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | 2023-02-07T00:00:00.000 | S | 120 | 25-44 | M | BLACK | FELONY ASSAULT | 40.636362 | -74.117505 |
1 | 2023-01-04T00:00:00.000 | M | 34 | 25-44 | F | BLACK | (null) | 40.856519 | -73.927564 |
2 | 2023-01-10T00:00:00.000 | B | 42 | 18-24 | F | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.822355 | -73.911463 |
3 | 2023-02-27T00:00:00.000 | Q | 113 | 25-44 | M | BLACK | (null) | 40.66588105 | -73.80183654 |
4 | 2023-01-07T00:00:00.000 | M | 18 | <18 | M | BLACK | ROBBERY | 40.76539 | -73.985702 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
54571 | 2023-03-20T00:00:00.000 | Q | 113 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.680077 | -73.775929 |
54572 | 2023-03-20T00:00:00.000 | M | 33 | 18-24 | M | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.837374 | -73.941122 |
54573 | 2023-03-13T00:00:00.000 | Q | 113 | 18-24 | M | WHITE | GRAND LARCENY | 40.680077 | -73.775929 |
54574 | 2023-03-17T00:00:00.000 | Q | 112 | 65+ | M | ASIAN / PACIFIC ISLANDER | ASSAULT 3 & RELATED OFFENSES | 40.709999 | -73.853973 |
54575 | 2023-03-31T00:00:00.000 | Q | 114 | 25-44 | M | WHITE | DANGEROUS WEAPONS | 40.7714791 | -73.91823339 |
54576 rows × 9 columns
df1['arrest_date'] = pd.to_datetime(df1['arrest_date'])
df1['arrest_date'] = df1['arrest_date'].dt.strftime('%B %d, %Y')
print(df1['arrest_date'])
0 February 07, 2023 1 January 04, 2023 2 January 10, 2023 3 February 27, 2023 4 January 07, 2023 ... 54571 March 20, 2023 54572 March 20, 2023 54573 March 13, 2023 54574 March 17, 2023 54575 March 31, 2023 Name: arrest_date, Length: 54576, dtype: object
df2
arrest_date | arrest_boro | arrest_precinct | age_group | perp_sex | perp_race | ofns_desc | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01T00:00:00.000 | K | 66 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.62572501600005 | -73.99104937999994 |
1 | 2018-01-01T00:00:00.000 | M | 17 | 25-44 | F | BLACK | DANGEROUS DRUGS | 40.752228400000035 | -73.97192485799998 |
2 | 2018-01-01T00:00:00.000 | S | 121 | <18 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.63194388100004 | -74.16170488099993 |
3 | 2018-01-01T00:00:00.000 | B | 40 | 45-64 | M | BLACK HISPANIC | INTOXICATED/IMPAIRED DRIVING | 40.81553235500007 | -73.91861784599996 |
4 | 2018-01-01T00:00:00.000 | B | 46 | 18-24 | M | BLACK HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.84643225400004 | -73.91104764699998 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
947079 | 2022-12-31T00:00:00.000 | M | 7 | 18-24 | M | WHITE | ASSAULT 3 & RELATED OFFENSES | 40.718911 | -73.98939 |
947080 | 2022-12-31T00:00:00.000 | K | 73 | 25-44 | M | BLACK | CRIMINAL MISCHIEF & RELATED OF | 40.666387 | -73.90458 |
947081 | 2022-12-31T00:00:00.000 | M | 32 | 25-44 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.822332 | -73.936708 |
947082 | 2022-12-31T00:00:00.000 | M | 24 | 25-44 | M | WHITE HISPANIC | GRAND LARCENY | 40.794146 | -73.968443 |
947083 | 2022-12-31T00:00:00.000 | M | 24 | 25-44 | M | BLACK | MISCELLANEOUS PENAL LAW | 40.789701 | -73.967972 |
947084 rows × 9 columns
df1 = df1.rename(columns={
'arrest_boro': 'Borough',
'ofns_desc': 'Offense',
'perp_race': 'Race',
'perp_sex': 'Sex',
'arrest_date':'Date',
'age_group':'Age Group',
'arrest_precinct':'Precinct'
})
df1
Date | Borough | Precinct | Age Group | Sex | Race | Offense | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | February 07, 2023 | S | 120 | 25-44 | M | BLACK | FELONY ASSAULT | 40.636362 | -74.117505 |
1 | January 04, 2023 | M | 34 | 25-44 | F | BLACK | (null) | 40.856519 | -73.927564 |
2 | January 10, 2023 | B | 42 | 18-24 | F | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.822355 | -73.911463 |
3 | February 27, 2023 | Q | 113 | 25-44 | M | BLACK | (null) | 40.66588105 | -73.80183654 |
4 | January 07, 2023 | M | 18 | <18 | M | BLACK | ROBBERY | 40.76539 | -73.985702 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
54571 | March 20, 2023 | Q | 113 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.680077 | -73.775929 |
54572 | March 20, 2023 | M | 33 | 18-24 | M | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.837374 | -73.941122 |
54573 | March 13, 2023 | Q | 113 | 18-24 | M | WHITE | GRAND LARCENY | 40.680077 | -73.775929 |
54574 | March 17, 2023 | Q | 112 | 65+ | M | ASIAN / PACIFIC ISLANDER | ASSAULT 3 & RELATED OFFENSES | 40.709999 | -73.853973 |
54575 | March 31, 2023 | Q | 114 | 25-44 | M | WHITE | DANGEROUS WEAPONS | 40.7714791 | -73.91823339 |
54576 rows × 9 columns
df2 = df2.rename(columns={
'arrest_boro': 'Borough',
'ofns_desc': 'Offense',
'perp_race': 'Race',
'perp_sex': 'Sex',
'arrest_date':'Date',
'age_group':'Age Group',
'arrest_precinct':'Precinct'
})
df2
Date | Borough | Precinct | Age Group | Sex | Race | Offense | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01T00:00:00.000 | K | 66 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.62572501600005 | -73.99104937999994 |
1 | 2018-01-01T00:00:00.000 | M | 17 | 25-44 | F | BLACK | DANGEROUS DRUGS | 40.752228400000035 | -73.97192485799998 |
2 | 2018-01-01T00:00:00.000 | S | 121 | <18 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.63194388100004 | -74.16170488099993 |
3 | 2018-01-01T00:00:00.000 | B | 40 | 45-64 | M | BLACK HISPANIC | INTOXICATED/IMPAIRED DRIVING | 40.81553235500007 | -73.91861784599996 |
4 | 2018-01-01T00:00:00.000 | B | 46 | 18-24 | M | BLACK HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.84643225400004 | -73.91104764699998 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
947079 | 2022-12-31T00:00:00.000 | M | 7 | 18-24 | M | WHITE | ASSAULT 3 & RELATED OFFENSES | 40.718911 | -73.98939 |
947080 | 2022-12-31T00:00:00.000 | K | 73 | 25-44 | M | BLACK | CRIMINAL MISCHIEF & RELATED OF | 40.666387 | -73.90458 |
947081 | 2022-12-31T00:00:00.000 | M | 32 | 25-44 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.822332 | -73.936708 |
947082 | 2022-12-31T00:00:00.000 | M | 24 | 25-44 | M | WHITE HISPANIC | GRAND LARCENY | 40.794146 | -73.968443 |
947083 | 2022-12-31T00:00:00.000 | M | 24 | 25-44 | M | BLACK | MISCELLANEOUS PENAL LAW | 40.789701 | -73.967972 |
947084 rows × 9 columns
abbr_to_full = {'M': 'Manhattan', 'B': 'Bronx', 'K': 'Brooklyn','Q': 'Queens','S': 'Staten Island'}
df1['Borough'] = df1['Borough'].replace(abbr_to_full)
abbr_to_full = {'M': 'Manhattan', 'B': 'Bronx', 'K': 'Brooklyn','Q': 'Queens','S': 'Staten Island'}
df2['Borough'] = df2['Borough'].replace(abbr_to_full)
df1
Date | Borough | Precinct | Age Group | Sex | Race | Offense | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | February 07, 2023 | Staten Island | 120 | 25-44 | M | BLACK | FELONY ASSAULT | 40.636362 | -74.117505 |
1 | January 04, 2023 | Manhattan | 34 | 25-44 | F | BLACK | (null) | 40.856519 | -73.927564 |
2 | January 10, 2023 | Bronx | 42 | 18-24 | F | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.822355 | -73.911463 |
3 | February 27, 2023 | Queens | 113 | 25-44 | M | BLACK | (null) | 40.66588105 | -73.80183654 |
4 | January 07, 2023 | Manhattan | 18 | <18 | M | BLACK | ROBBERY | 40.76539 | -73.985702 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
54571 | March 20, 2023 | Queens | 113 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.680077 | -73.775929 |
54572 | March 20, 2023 | Manhattan | 33 | 18-24 | M | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.837374 | -73.941122 |
54573 | March 13, 2023 | Queens | 113 | 18-24 | M | WHITE | GRAND LARCENY | 40.680077 | -73.775929 |
54574 | March 17, 2023 | Queens | 112 | 65+ | M | ASIAN / PACIFIC ISLANDER | ASSAULT 3 & RELATED OFFENSES | 40.709999 | -73.853973 |
54575 | March 31, 2023 | Queens | 114 | 25-44 | M | WHITE | DANGEROUS WEAPONS | 40.7714791 | -73.91823339 |
54576 rows × 9 columns
df2
Date | Borough | Precinct | Age Group | Sex | Race | Offense | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01T00:00:00.000 | Brooklyn | 66 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.62572501600005 | -73.99104937999994 |
1 | 2018-01-01T00:00:00.000 | Manhattan | 17 | 25-44 | F | BLACK | DANGEROUS DRUGS | 40.752228400000035 | -73.97192485799998 |
2 | 2018-01-01T00:00:00.000 | Staten Island | 121 | <18 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.63194388100004 | -74.16170488099993 |
3 | 2018-01-01T00:00:00.000 | Bronx | 40 | 45-64 | M | BLACK HISPANIC | INTOXICATED/IMPAIRED DRIVING | 40.81553235500007 | -73.91861784599996 |
4 | 2018-01-01T00:00:00.000 | Bronx | 46 | 18-24 | M | BLACK HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.84643225400004 | -73.91104764699998 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
947079 | 2022-12-31T00:00:00.000 | Manhattan | 7 | 18-24 | M | WHITE | ASSAULT 3 & RELATED OFFENSES | 40.718911 | -73.98939 |
947080 | 2022-12-31T00:00:00.000 | Brooklyn | 73 | 25-44 | M | BLACK | CRIMINAL MISCHIEF & RELATED OF | 40.666387 | -73.90458 |
947081 | 2022-12-31T00:00:00.000 | Manhattan | 32 | 25-44 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.822332 | -73.936708 |
947082 | 2022-12-31T00:00:00.000 | Manhattan | 24 | 25-44 | M | WHITE HISPANIC | GRAND LARCENY | 40.794146 | -73.968443 |
947083 | 2022-12-31T00:00:00.000 | Manhattan | 24 | 25-44 | M | BLACK | MISCELLANEOUS PENAL LAW | 40.789701 | -73.967972 |
947084 rows × 9 columns
df1.dropna()
Date | Borough | Precinct | Age Group | Sex | Race | Offense | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | February 07, 2023 | Staten Island | 120 | 25-44 | M | BLACK | FELONY ASSAULT | 40.636362 | -74.117505 |
1 | January 04, 2023 | Manhattan | 34 | 25-44 | F | BLACK | (null) | 40.856519 | -73.927564 |
2 | January 10, 2023 | Bronx | 42 | 18-24 | F | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.822355 | -73.911463 |
3 | February 27, 2023 | Queens | 113 | 25-44 | M | BLACK | (null) | 40.66588105 | -73.80183654 |
4 | January 07, 2023 | Manhattan | 18 | <18 | M | BLACK | ROBBERY | 40.76539 | -73.985702 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
54571 | March 20, 2023 | Queens | 113 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.680077 | -73.775929 |
54572 | March 20, 2023 | Manhattan | 33 | 18-24 | M | WHITE HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.837374 | -73.941122 |
54573 | March 13, 2023 | Queens | 113 | 18-24 | M | WHITE | GRAND LARCENY | 40.680077 | -73.775929 |
54574 | March 17, 2023 | Queens | 112 | 65+ | M | ASIAN / PACIFIC ISLANDER | ASSAULT 3 & RELATED OFFENSES | 40.709999 | -73.853973 |
54575 | March 31, 2023 | Queens | 114 | 25-44 | M | WHITE | DANGEROUS WEAPONS | 40.7714791 | -73.91823339 |
54576 rows × 9 columns
df2.dropna()
Date | Borough | Precinct | Age Group | Sex | Race | Offense | latitude | longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01T00:00:00.000 | Brooklyn | 66 | 25-44 | M | ASIAN / PACIFIC ISLANDER | FELONY ASSAULT | 40.62572501600005 | -73.99104937999994 |
1 | 2018-01-01T00:00:00.000 | Manhattan | 17 | 25-44 | F | BLACK | DANGEROUS DRUGS | 40.752228400000035 | -73.97192485799998 |
2 | 2018-01-01T00:00:00.000 | Staten Island | 121 | <18 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.63194388100004 | -74.16170488099993 |
3 | 2018-01-01T00:00:00.000 | Bronx | 40 | 45-64 | M | BLACK HISPANIC | INTOXICATED/IMPAIRED DRIVING | 40.81553235500007 | -73.91861784599996 |
4 | 2018-01-01T00:00:00.000 | Bronx | 46 | 18-24 | M | BLACK HISPANIC | ASSAULT 3 & RELATED OFFENSES | 40.84643225400004 | -73.91104764699998 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
947079 | 2022-12-31T00:00:00.000 | Manhattan | 7 | 18-24 | M | WHITE | ASSAULT 3 & RELATED OFFENSES | 40.718911 | -73.98939 |
947080 | 2022-12-31T00:00:00.000 | Brooklyn | 73 | 25-44 | M | BLACK | CRIMINAL MISCHIEF & RELATED OF | 40.666387 | -73.90458 |
947081 | 2022-12-31T00:00:00.000 | Manhattan | 32 | 25-44 | F | BLACK | ASSAULT 3 & RELATED OFFENSES | 40.822332 | -73.936708 |
947082 | 2022-12-31T00:00:00.000 | Manhattan | 24 | 25-44 | M | WHITE HISPANIC | GRAND LARCENY | 40.794146 | -73.968443 |
947083 | 2022-12-31T00:00:00.000 | Manhattan | 24 | 25-44 | M | BLACK | MISCELLANEOUS PENAL LAW | 40.789701 | -73.967972 |
945860 rows × 9 columns
DATA VISUALIZATION
df2['Date'] = pd.to_datetime(df2['Date'])
df2 = df2.set_index('Date')
weekly_offenses = df2['Offense'].resample('W').count()
plt.figure(figsize=(20, 10))
plt.plot(weekly_offenses)
plt.xlabel('Date')
plt.ylabel('Number of Offenses')
plt.title('Offense Trends')
plt.show()
df1['Offense'].value_counts().sort_values().plot(kind='barh', figsize=(9.5,10), color='Red',grid=True)
plt.xlabel('Frequency')
plt.ylabel('Offense')
plt.title('Count of Offenses by Type')
Text(0.5, 1.0, 'Count of Offenses by Type')
df1.groupby(['Offense', 'Age Group']).size().unstack().plot(kind='bar', stacked=True,figsize=(10,5))
plt.title('Type of Offense by Age Group')
Text(0.5, 1.0, 'Type of Offense by Age Group')
df1.groupby(['Offense', 'Age Group']).size().unstack().plot(kind='barh', stacked=True, figsize=(10, 12))
plt.title('Type of Offense by Age Group')
plt.xlabel('Number of Offenses')
plt.ylabel('Offense')
plt.show()
df_counts = df1.groupby(['Offense', 'Age Group']).size().unstack()
df_counts_sorted = df_counts.sum(axis=1).sort_values(ascending=True) # Change to ascending=True
df_sorted = df_counts.loc[df_counts_sorted.index]
df_sorted.plot(kind='barh', stacked=True, figsize=(10, 11))
plt.title('Type of Offense by Age Group')
plt.xlabel('Number of Offenses')
plt.ylabel('Offense')
plt.show()
df_counts = df1.groupby(['Offense', 'Race']).size().unstack()
df_counts_sorted = df_counts.sum(axis=1).sort_values(ascending=True) # Change to ascending=True
df_sorted = df_counts.loc[df_counts_sorted.index]
df_sorted.plot(kind='barh', stacked=True, figsize=(10, 12))
plt.title('Type of Offense by Race')
plt.xlabel('Number of Offenses')
plt.ylabel('Offense')
plt.show()
df_counts = df1.groupby(['Offense', 'Sex']).size().unstack()
df_counts_sorted = df_counts.sum(axis=1).sort_values(ascending=True) # Change to ascending=True
df_sorted = df_counts.loc[df_counts_sorted.index]
df_sorted.plot(kind='barh', stacked=True, figsize=(10, 11))
plt.title('Type of Offense by Sex')
plt.xlabel('Number of Offenses')
plt.ylabel('Offense')
plt.show()
df_counts = df1.groupby(['Offense', 'Borough']).size().unstack()
df_counts_sorted = df_counts.sum(axis=1).sort_values(ascending=True) # Change to ascending=True
df_sorted = df_counts.loc[df_counts_sorted.index]
df_sorted.plot(kind='barh', stacked=True, figsize=(10, 11))
plt.title('Type of Offense by Borough')
plt.xlabel('Number of Offenses')
plt.ylabel('Offense')
plt.show()
df_counts = df1.groupby(['Borough', 'Race']).size().unstack()
df_counts_sorted = df_counts.sum(axis=1).sort_values(ascending=True) # Change to ascending=True
df_sorted = df_counts.loc[df_counts_sorted.index]
df_sorted.plot(kind='barh', stacked=True, figsize=(12, 8))
plt.title('Number of Offenses by Borough and Race')
plt.xlabel('Number of Offenses')
plt.ylabel('Borough')
plt.show()
df_counts = df1.groupby(['Borough', 'Age Group']).size().unstack()
df_counts_sorted = df_counts.sum(axis=1).sort_values(ascending=True) # Change to ascending=True
df_sorted = df_counts.loc[df_counts_sorted.index]
df_sorted.plot(kind='barh', stacked=True, figsize=(12, 8))
plt.title('Number of Offenses by Borough and Age Group')
plt.xlabel('Number of Offenses')
plt.ylabel('Borough')
plt.show()
# Group the dataframe by precinct and count the number of offenses
df_grouped = df1.groupby(['Precinct', 'latitude', 'longitude']).agg({'Offense': 'count'}).reset_index()
# Create a map centered at New York City
ny_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)
# Create a heat map layer with the latitude, longitude, and offense count
heat_data = [[row['latitude'], row['longitude'], row['Offense']] for index, row in df_grouped.iterrows()]
HeatMap(heat_data).add_to(ny_map)
# Display the map
ny_map
STATISTICAL ANALYSIS
offense_categories = df1['Offense'].unique()
category_dict = {offense_categories[i]: i+1 for i in range(len(offense_categories))}
df1['Offense Category'] = df1['Offense'].map(category_dict)
# Perform describe() on the counts of each category
category_counts = df1['Offense Category'].value_counts()
category_describe = category_counts.describe()
print(category_describe)
count 61.000000 mean 894.688525 std 1577.582270 min 1.000000 25% 16.000000 50% 174.000000 75% 1038.000000 max 7888.000000 Name: Offense Category, dtype: float64
# Create a dictionary to map offense categories to numbers
offense_categories = df1['Offense'].unique()
category_dict = {offense_categories[i]: i+1 for i in range(len(offense_categories))}
df1['Offense Category'] = df1['Offense'].map(category_dict)
# Perform describe() on the counts of each category by age group
age_group_categories = df1.groupby('Age Group')['Offense Category']
age_group_describe = age_group_categories.value_counts().groupby(level=0).describe()
print(age_group_describe)
count mean std min 25% 50% 75% max Age Group 18-24 53.0 178.698113 273.629563 1.0 4.00 44.0 229.00 1336.0 25-44 58.0 542.379310 931.071121 2.0 13.75 109.0 719.25 4554.0 45-64 55.0 195.490909 368.336796 1.0 6.50 36.0 199.50 1764.0 65+ 41.0 19.634146 35.655824 1.0 2.00 5.0 22.00 187.0 <18 34.0 61.470588 118.534290 1.0 3.00 16.0 46.75 620.0
# Create a dictionary to map offense categories to numbers
offense_categories = df1['Offense'].unique()
category_dict = {offense_categories[i]: i+1 for i in range(len(offense_categories))}
df1['Offense Category'] = df1['Offense'].map(category_dict)
# Perform describe() on the counts of each category by sex
sex_categories = df1.groupby('Sex')['Offense Category']
sex_describe = sex_categories.value_counts().groupby(level=0).describe()
print(sex_describe)
count mean std min 25% 50% 75% max Sex F 51.0 175.941176 368.174383 1.0 7.5 22.0 183.5 1987.0 M 61.0 738.885246 1242.842791 1.0 11.0 154.0 860.0 5812.0 U 36.0 14.750000 20.024806 1.0 2.0 5.0 21.0 89.0
# Create a dictionary to map offense categories to numbers
offense_categories = df1['Offense'].unique()
category_dict = {offense_categories[i]: i+1 for i in range(len(offense_categories))}
df1['Offense Category'] = df1['Offense'].map(category_dict)
# Perform describe() on the counts of each category by age group
age_group_categories = df1.groupby('Race')['Offense Category']
age_group_describe = age_group_categories.value_counts().groupby(level=0).describe()
print(age_group_describe)
count mean std min 25% \ Race AMERICAN INDIAN/ALASKAN NATIVE 28.0 6.750000 8.382632 1.0 1.75 ASIAN / PACIFIC ISLANDER 49.0 59.081633 92.045242 1.0 7.00 BLACK 55.0 484.854545 799.600002 2.0 11.00 BLACK HISPANIC 50.0 101.100000 148.104383 1.0 5.00 UNKNOWN 40.0 12.000000 14.609445 1.0 2.00 WHITE 49.0 111.510204 195.416662 1.0 5.00 WHITE HISPANIC 58.0 238.379310 411.845007 1.0 7.25 50% 75% max Race AMERICAN INDIAN/ALASKAN NATIVE 3.0 7.00 36.0 ASIAN / PACIFIC ISLANDER 27.0 57.00 478.0 BLACK 90.0 559.00 3825.0 BLACK HISPANIC 43.5 130.00 717.0 UNKNOWN 7.5 16.25 67.0 WHITE 31.0 106.00 976.0 WHITE HISPANIC 49.5 267.25 2086.0
female_count = df1[df1['Sex'] == 'F'].shape[0]
print("Count of females:", female_count)
Count of females: 8973
male_count = df1[df1['Sex'] == 'M'].shape[0]
print("Count of Males:", male_count)
Count of Males: 45072
race_proportions = df1['Race'].value_counts(normalize=True)
print(race_proportions)
BLACK 0.488621 WHITE HISPANIC 0.253335 WHITE 0.100117 BLACK HISPANIC 0.092623 ASIAN / PACIFIC ISLANDER 0.053045 UNKNOWN 0.008795 AMERICAN INDIAN/ALASKAN NATIVE 0.003463 Name: Race, dtype: float64
race_proportions = df1['Race'].value_counts(normalize=True)
labels = race_proportions.index
sizes = race_proportions.values
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
ax.axis('equal')
ax.set_title('Race Proportions')
plt.show()
age_proportions = df1['Age Group'].value_counts(normalize=True)
print(age_proportions)
25-44 0.576407 45-64 0.197010 18-24 0.173538 <18 0.038295 65+ 0.014750 Name: Age Group, dtype: float64
age_proportions = df1['Age Group'].value_counts(normalize=True)
labels = age_proportions.index
sizes = age_proportions.values
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
ax.axis('equal')
ax.set_title('Age Group Proportions')
plt.show()
SQL DATABASE EXPORT
import sqlite3
# create a connection to the database
conn = sqlite3.connect('C:/path/to/mock/database.db')
# export the df1 DataFrame to the database
df1.to_sql('NYC_Arrest_Data', conn, if_exists='replace', index=False)
# close the database connection
conn.close()