What this notebook does

  1. Gathers building footprint data, PLUTO lot data, and housing maintenance code violation data from NYC Open Data
  2. Pulls out the violations related to fire safety (fire escapes, fire egress, smoke detectors, fireproofing, stoves)
  3. Maps the violations to community district boundaries from Census data for the choropleth map (a sketch of this spatial join follows the list)
  4. Groups the violations by building type (PLUTO land use) for the second graphic in the story
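The community district join itself isn't shown in this notebook. A minimal sketch of how that step could work, assuming geopandas is available and using placeholder file names for the boundary file (the violation export name matches the commented-out to_csv call further down):

import geopandas as gpd
import pandas as pd

# assumed inputs: a community district boundary file and the fire-violation export produced below
districts = gpd.read_file('community_districts.geojson')
violations = pd.read_csv('final_housingvio_fire_2019_open.csv')

# build point geometries from the violation coordinates and match the districts' CRS
points = gpd.GeoDataFrame(
    violations,
    geometry=gpd.points_from_xy(violations['Longitude'], violations['Latitude']),
    crs='EPSG:4326',
).to_crs(districts.crs)

# spatial join: each violation gets the district polygon it falls inside
joined = gpd.sjoin(points, districts, how='left', predicate='within')
per_district = joined.groupby('index_right').size()  # counts per district for the choropleth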
import psutil
memory = psutil.virtual_memory()
print(f" {'*' * 3} Memory used percentage - {memory.percent} {'*' * 4} Free Memory available - { round(memory.free / (1024.0 ** 3))} GB")
 *** Memory used percentage - 96.1 **** Free Memory available - 0 GB
import pandas as pd
import numpy as np

buildings = pd.read_csv('latest_buildings_polygon_sanitised.csv')
plutos = pd.read_csv('latest_pluto_sanitised.csv')
housing_violations_og = pd.read_csv('Housing_Maintenance_Code_Violations.csv')
list(plutos.columns)
['Unnamed: 0',
 'bbl',
 'bct2020',
 'firecomp',
 'overlay1',
 'overlay2',
 'ltdheight',
 'bldgclass',
 'landuse',
 'ownertype',
 'ownername',
 'bldgarea',
 'numbldgs',
 'numfloors',
 'unitstotal',
 'proxcode',
 'yearbuilt',
 'yearalter1',
 'yearalter2',
 'edesignum',
 'latitude',
 'longitude']
list(housing_violations.columns)
['ViolationID',
 'BuildingID',
 'RegistrationID',
 'BoroID',
 'Borough',
 'HouseNumber',
 'LowHouseNumber',
 'HighHouseNumber',
 'StreetName',
 'StreetCode',
 'Postcode',
 'Apartment',
 'Story',
 'Block',
 'Lot',
 'Class',
 'InspectionDate',
 'ApprovedDate',
 'OriginalCertifyByDate',
 'OriginalCorrectByDate',
 'NewCertifyByDate',
 'NewCorrectByDate',
 'CertifiedDate',
 'OrderNumber',
 'NOVID',
 'NOVDescription',
 'NOVIssuedDate',
 'CurrentStatusID',
 'CurrentStatus',
 'CurrentStatusDate',
 'NovType',
 'ViolationStatus',
 'RentImpairing',
 'Latitude',
 'Longitude',
 'CommunityBoard',
 'CouncilDistrict',
 'CensusTract',
 'BIN',
 'BBL',
 'NTA']
housing_violations = housing_violations_og[[
'Borough',
'ViolationID',
 'BuildingID',
 'Class',
 'InspectionDate',
 'ApprovedDate',
 'OriginalCertifyByDate',
 'OriginalCorrectByDate',
 'NewCertifyByDate',
 'NewCorrectByDate',
 'CertifiedDate',
'OrderNumber',
 'NOVDescription',
 'NOVIssuedDate',
 'CurrentStatusID',
 'CurrentStatus',
 'CurrentStatusDate',
 'NovType',
 'ViolationStatus',
 'Latitude',
 'Longitude',
 'CensusTract',
'CommunityBoard',
 'BIN',
]]
# revert to the full set of columns; the merge and analysis below use fields beyond the subset above
housing_violations = housing_violations_og
orders = pd.read_csv('order descs.csv')
orders.shape
(387, 2)
orders['Order No.'] = orders['Order No.'].astype(str)
housing_violations['OrderNumber'] = housing_violations['OrderNumber'].astype(str)
mdf = housing_violations.merge(orders, how="left", left_on="OrderNumber", right_on="Order No.")
mdf[['OrderNumber','Order No.']].head()

# keep only rows where the order number actually matched a description (effectively an inner join)
mdf = mdf[mdf['OrderNumber'] == mdf['Order No.']]
mdf['BIN'].nunique()
151202
list(mdf.columns) 
['ViolationID',
 'BuildingID',
 'RegistrationID',
 'BoroID',
 'Borough',
 'HouseNumber',
 'LowHouseNumber',
 'HighHouseNumber',
 'StreetName',
 'StreetCode',
 'Postcode',
 'Apartment',
 'Story',
 'Block',
 'Lot',
 'Class',
 'InspectionDate',
 'ApprovedDate',
 'OriginalCertifyByDate',
 'OriginalCorrectByDate',
 'NewCertifyByDate',
 'NewCorrectByDate',
 'CertifiedDate',
 'OrderNumber',
 'NOVID',
 'NOVDescription',
 'NOVIssuedDate',
 'CurrentStatusID',
 'CurrentStatus',
 'CurrentStatusDate',
 'NovType',
 'ViolationStatus',
 'RentImpairing',
 'Latitude',
 'Longitude',
 'CommunityBoard',
 'CouncilDistrict',
 'CensusTract',
 'BIN',
 'BBL',
 'NTA',
 'Order No.',
 'Order Number Description']
mdf['Order Number Description']
0          § 27-2005 ADMIN.\n  CODE: REPAIR THE BROKEN OR...
1          § 27-2026 ADM\n  CODE REPAIR THE LEAKY AND/OR ...
2          § 27-2018 \n  ADMIN. CODE: ABATE THE INFESTATI...
3          § 27-2026 ADM\n  CODE REPAIR THE LEAKY AND/OR ...
4          § 27-2026 ADM\n  CODE REPAIR THE LEAKY AND/OR ...
                                 ...                        
2679743    § 27-2005, 2007\n  ADM CODE ARRANGE AND MAKE S...
2679744    § 27-2056.6 ADM\n  CODE - CORRECT THE LEAD-BAS...
2679745    § 27-2056.6 ADM\n  CODE - CORRECT THE LEAD-BAS...
2679746    § 27-2056.6 ADM\n  CODE - CORRECT THE LEAD-BAS...
2679747    § 27-2013 ADMIN.\n  CODE: PAINT WITH LIGHT COL...
Name: Order Number Description, Length: 2455575, dtype: object
substr = "FIRE|FIREPROOF|SMOKE|STOVE"
# na=False guards against rows with a missing description; the pattern is treated as a regex
cleaned_df = mdf[mdf['Order Number Description'].str.contains(substr, na=False)]
cleaned_df['Order Number Description'].unique()
array(['§ 27-2045 ADM\n  CODE REPAIR OR REPLACE THE SMOKE DETECTOR',
       '§ 27-2045 ADM\n  CODE POST A PROPER NOTICE OF SMOKE DETECTOR REQUIREMENTS, IN A FORM\n  APPROVED BY THE COMMISSIONER, AT OR NEAR THE MAIL BOX',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. REPLACE WITH NEW THE BROKEN, DEFECTIVE\n  AND/OR MISSING',
       '§ 27-2005, 2007\n  ADM CODE AND DEPT. RULES AND REGULATIONS. REMOVE THE ENCUMBRANCE\n  OBSTRUCTING EGRESS FROM FIRE ESCAPES',
       '§ 27-2005, 2007\n  ADM CODE FIRE EGRESS DEFECTIVE. REMOVE OBSTRUCTING BARS OR UNLAWFUL\n  GATES FROM WINDOW TO FIRE ESCAPE OR PROVIDE APPROVED TYPE GATE',
       '§ 27-2005, 2007\n  ADM CODE FIRE EGRESS DEFECTIVE. REMOVE OBSTRUCTING BARS OR UNLAWFUL\n  GATES FROM AT LEAST 1 WINDOW OR PROVIDE APPROVED TYPE GATE',
       '§ 27-2005, 2007\n  ADM CODE FIRE EGRESS DEFECTIVE. REMOVE THE OBSTRUCTION IN FIREPROOF\n  PASSAGEWAY',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. ADJUST THE DROPLADDER SO THAT IT WILL SLIDE\n  EASILY IN THE GUIDE RODS',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. PROPERLY SECURE',
       '§ 27-2005, 2007\n  ADM CODE FIRE EGRESS DEFECTIVE. RESTORE EGRESS BY REMOVING LOCKING\n  DEVICE AND ARRANGE TO OPEN READILY THE GATE IN FENCE AT LOT LINE',
       '§ 27-2005, 2007\n  ADM CODE FIRE EGRESS DEFECTIVE. MAKE STATIONARY THE IRON LADDER LEADING\n  TO SCUTTLE OPENING IN ROOF',
       '§ 27-2005,2007\n  ADMON. CODE: FIRE EGRESS DEFECTIVE. REMOVE OBSTRUCTING BARS OR GATES\n  FROM AT LEAST 1 WINDOW.',
       '§ 27-2045, 2046\n  ADM CODE PROVIDE AN APPROVED AND OPERATIONAL SMOKE DETECTING DEVICE,\n  INSTALLED IN ACCORDANCE WITH DEPARTMENT OF BUILDINGS RULES AND REGULATIONS',
       '§ 185, 240 \n  M/D LAW PROPERLY FIRE RETARD IN ACCORDANCE WITH THE RULES AND REGULATIONS OF\n  THIS DEPARTMENT THE CELLAR CEILING',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. RESET THE BRACES OF THE BRACKETS SO THAT\n  THEY SHALL BEAR AGAINST WALL',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. FILL IN HOLES IN WALL AROUND BRACKETS\n  AND/OR RAILS',
       '§ 27-2044 ADM\n  CODE - REPAIR OR REPLACE SMOKE DETECTOR',
       '§ 62 M/D\n  LAW REARRANGE TELEVISION AND/OR RADIO ANTENNA TO BE AT LEAST 10 FEET\n  ABOVE ROOF AND NOT ATTACHED TO ANY FIRE ESCAPE OR SOIL OR VENT LINE',
       '§ 27-2005, 2044\n  ADM CODE PROVIDE APPROVED ONE-HOUR FIRE RESISTANCE RATED SELF-CLOSING\n  DOOR WITH KEY-OPERATED DEAD BOLT & LATCH SET, PEEP HOLE & CHAIN GUARD\n  AT MAIN ENTRANCE TO',
       '§ 27-2046.4 HMC:\n  PROVIDE CONTROL KNOB SAFETY COVERS FOR THE GAS STOVE.',
       '§ 27-2045, 2046\n  ADM CODE FILE CERTIFICATION OF SATISFACTORY INSTALLATION OF SMOKE\n  DETECTING DEVICE IN ACCORDANCE WITH H.P.D. RULES AND REGULATIONS.',
       '§ 27-2005 ADMIN.\n  CODE: FIRE ESCAPE DEFECTIVE. PROPERLY SECURE',
       '§ 27-2005 ADMIN.\n  CODE: FIRE ESCAPE DEFECTIVE. ADJUST THE DROPLADDER SO THAT IT WILL\n  SLIDE EASILY IN THE GUIDE RODS',
       '§ 53,187,231 M/D\n  LAW & DEPT RULES & REG. PROVIDE MEANS OF EGRESS FROM YARD TO STREET\n  BY FIREPROOF PASSAGEWAY FOR WHICH APPLICATION MUST BE FILED FOR APPROVAL,OR\n  BY UNLOCKED DOOR OR GATE IN FENCE TO ADJOINING PREMISES WITH CONSENT OF\n  ADJACENT OWNER.',
       '§ 27-2005 ADMIN.\n  CODE: FIRE ESCAPE DEFECTIVE. REPLACE WITH NEW THE BROKEN, DEFECTIVE\n  AND/OR MISSING',
       '§ 27-2005,2007\n  ADMIN. CODE: FIRE EGRESS DEFECTIVE. REMOVE OBSTRUCTING BARS OR GATES\n  FROM WINDOW TO FIRE ESCAPE',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. REPLACE WITH 2 X 1/2 INCH IRON BAR\n  EXTENDING ACROSS AND SECURED TO 2 BRACKETS, THE BROKEN OR DEFECTIVE ANGLE\n  IRON OR BAR AT FOOT OF GOOSENECK LADDER',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. REMOVE AND REPLACE WITH NEW THE BROKEN AND\n  DEFECTIVE BRACKET AND LEAVE ALL ANCHORAGES UNCOVERED FOR INSPECTION',
       '§ 27-2005,2007\n  ADMIN. CODE AND DEPARTMENT RULES AND REGULATIONS: REMOVE THE ENCUMBRANCE\n  OBSTRUCTING EGRESS FROM FIRE ESCAPES',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. REMOVE OR MAKE SAFE THE ADDITIONAL\n  FIRE ESCAPES',
       '§ 53, 187, 231\n  M/D LAW AND DEPARTMENT RULES AND REGULATIONS. PROVIDE A FIREPROOF\n  PASSAGEWAY FROM FIRE ESCAPES AT REAR TO STREET AND FILE APPLICATION FOR\n  APPROVAL BEFORE DOING THIS WORK.',
       '§ 27-2045 ADM\n  CODE - PROVIDE AN APPROVED AND OPERATIONAL SMOKE DETECTING DEVICE, INSTALLED\n  IN ACCORDANCE WITH DEPARTMENT OF BUILDINGS RULES AND REGULATIONS',
       '§ 27-2044 ADM\n  CODE REMOVE UNLAWFUL SASH IN PARTITION SEPARATING APARTMENT FROM PUBLIC\n  HALL AND SEAL OPENING WITH FIRE RETARDED MATERIAL',
       '§ 27-2045 ADM\n  CODE - POST A PROPER NOTICE OF SMOKE DETECTOR REQUIREMENTS, IN A FORM\n  APPROVED BY THE COMMISSIONER , AT OR NEAR THE MAILBOX',
       '§ 27-2005 ADM\n  CODE FIRE ESCAPE DEFECTIVE. REMOVE THE ADDITIONAL, DANGEROUS,\n  INADEQUATE FIRE ESCAPE',
       '§ 27-2045 ADM\n  CODE - FILE CERTIFICATION OF SATISFACTORY INSTALLATION OF SMOKE DETECTING\n  DEVICE IN ACCORDANCE WITH HPD RULES AND REGULATIONS',
       '§ 27-2045, 2046\n  ADM CODE MAINTAIN ON THE PREMISES AND MAKE AVAILABLE TO THE DEPARTMENT\n  UPON REQUEST RECORDS OF INSTALLATION AND MAINTENANCE OF SMOKE DETECTORS IN\n  THE BUILDING.',
       '§ 27-2044 ADM\n  CODE PROPERLY FIRE-RETARD, IN ACCORDANCE WITH THE RULES AND REGULATIONS\n  OF THIS DEPARTMENT, THE CEILING',
       '§ 27-2044 ADM\n  CODE PROVIDE APPROVED ONE-HOUR FIRE RESISTANCE RATED SELF-CLOSING DOOR\n  AT ALL OPENINGS TO PUBLIC HALL OTHER THAN APT MAIN ENTRANCE DOOR FROM'],
      dtype=object)
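Since the matched descriptions above are all upper-case boilerplate, a slightly more defensive version of the filter, plus an optional per-keyword tag for later breakdowns, could look like this; the keyword list is an assumption derived from the unique descriptions printed above:

import re

# assumed keyword list, based on the matched descriptions above
fire_keywords = ['FIRE ESCAPE', 'FIRE EGRESS', 'FIREPROOF', 'FIRE RETARD', 'SMOKE DETECT', 'STOVE']
pattern = '|'.join(fire_keywords)

# case-insensitive, NaN-safe match
mask = mdf['Order Number Description'].str.contains(pattern, case=False, na=False)
fire_df = mdf[mask].copy()

# tag each row with the first keyword it matched, for per-keyword counts
fire_df['fire_keyword'] = fire_df['Order Number Description'].str.extract(
    f'({pattern})', flags=re.IGNORECASE
)[0].str.upper()
fire_df['fire_keyword'].value_counts()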
final_df = cleaned_df[cleaned_df['ViolationStatus']=='Open']
final_df['BIN'].nunique()
19043
len(final_df[final_df['InspectionDate'].str.contains("2022")])
17664
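Counting 2022 inspections with a substring match works here because InspectionDate is stored as a string, but parsing the dates is safer if the export format ever changes. A minimal sketch:

# parse InspectionDate and filter on the year; assumes the dates parse with pandas' default parser
insp = pd.to_datetime(final_df['InspectionDate'], errors='coerce')
(insp.dt.year == 2022).sum()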
# final_df.to_csv('final_housingvio_fire_2019_open.csv')

cleaned_df.to_csv('cleaned_housingvio_fire_2019.csv')
plutos.nunique()
Unnamed: 0    858619
bbl           858619
bct2020         2322
firecomp         348
overlay1          10
overlay2          10
ltdheight          2
bldgclass        214
landuse           11
ownertype          5
ownername     731377
bldgarea       38517
numbldgs         138
numfloors        175
unitstotal       791
proxcode           4
yearbuilt        252
yearalter1       121
yearalter2        74
edesignum        706
latitude      739433
longitude     765538
dtype: int64
buildings.nunique()
Unnamed: 0    1084656
the_geom      1084656
NAME             1563
BIN           1084646
CNSTRCT_YR        249
LSTMODDATE       1455
LSTSTATYPE         12
HEIGHTROOF     239620
MPLUTO_BBL     817420
dtype: int64
merged_df = buildings.merge(plutos, left_on='MPLUTO_BBL', right_on='bbl', how='left')
merged_df.shape
(1084656, 31)
merged_df['ownertype'].isnull().sum()
1050300
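About 1.05M of the 1.08M rows have a null ownertype, but that alone doesn't say how many footprints failed to match PLUTO at all, since ownertype is sparse in PLUTO itself. A quick way to separate the two, sketched here:

# share of building footprints that found a PLUTO lot (bbl comes from the right-hand side of the merge)
match_rate = merged_df['bbl'].notna().mean()
print(f"PLUTO match rate: {match_rate:.1%}")

# among matched rows, how often ownertype is actually populated
matched = merged_df[merged_df['bbl'].notna()]
print(f"ownertype populated (matched rows only): {matched['ownertype'].notna().mean():.1%}")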
merged_df.head()['MPLUTO_BBL']
0    3065220021
1    5012640036
2    5060190091
3    3086910048
4    4075020005
Name: MPLUTO_BBL, dtype: int64
list(merged_df.columns)
['Unnamed: 0_x',
 'the_geom',
 'NAME',
 'BIN',
 'CNSTRCT_YR',
 'LSTMODDATE',
 'LSTSTATYPE',
 'HEIGHTROOF',
 'MPLUTO_BBL',
 'Unnamed: 0_y',
 'bbl',
 'bct2020',
 'firecomp',
 'overlay1',
 'overlay2',
 'ltdheight',
 'bldgclass',
 'landuse',
 'ownertype',
 'ownername',
 'bldgarea',
 'numbldgs',
 'numfloors',
 'unitstotal',
 'proxcode',
 'yearbuilt',
 'yearalter1',
 'yearalter2',
 'edesignum',
 'latitude',
 'longitude']
merged_df[['bbl', 'MPLUTO_BBL']].head()
bbl MPLUTO_BBL
0 3.065220e+09 3065220021
1 5.012640e+09 5012640036
2 5.060190e+09 5060190091
3 3.086910e+09 3086910048
4 4.075020e+09 4075020005
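The preview shows bbl as a float (scientific notation) while MPLUTO_BBL is an int; for an exact row-by-row check, both can be cast to a nullable integer first. A sketch:

# cast both BBL columns to nullable integers so they compare cleanly
bbl_int = merged_df['bbl'].astype('Int64')
mask = bbl_int.notna() & (bbl_int != merged_df['MPLUTO_BBL'].astype('Int64'))
print(int(mask.sum()), 'matched rows where the two BBL columns disagree')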
merged_df = merged_df[['Unnamed: 0_x',
 'the_geom',
 'BIN',
 'bbl',
 'bct2020',
 'bldgclass',
 'landuse',
 'ownertype',
'ownername',
 'unitstotal',
 'latitude',
 'longitude']]
merged_df.to_csv('final_cleaned_df.csv')
relevant_bins = pd.Series(pd.read_csv('bins.csv', header=None)[0])
# sanity check: both key columns are integer-typed before the isin() lookup
type(merged_df['BIN'][0])
numpy.int64
type(relevant_bins.astype(int)[0])
numpy.int64
relevant_df = merged_df[merged_df['BIN'].isin(relevant_bins)]
relevant_df.shape
# relevant_df['ownertype'].count()
(15383, 31)
relevant_bins = pd.Series(pd.read_csv('bins.csv', header=None)[0])
# new_merged_df is a re-merged variant of merged_df built in an earlier run (not shown here)
relevant_df = new_merged_df[new_merged_df['BIN'].isin(relevant_bins)]
relevant_df.shape
(15348, 31)
merged_df = pd.read_csv('cleaned_df.csv')
all_plutos = merged_df  # keep an unfiltered frame around for the comparison plot below
# relevant_bins = pd.Series(pd.read_csv('../../../tejalwakchoure.github.io/chart-data/bins.csv', header=None)[0])
merged_df = merged_df[(merged_df['yearalter2'] > 2005) | (merged_df['yearalter1'] > 2005)]
relevant_df = merged_df[merged_df['BIN'].isin(relevant_bins)].copy()  # .copy() avoids the SettingWithCopyWarning below

relevant_df['yearalter2'] = np.where(relevant_df['yearalter2'].isnull(), relevant_df['yearalter1'], relevant_df['yearalter2'])
grouped_rel_df = relevant_df.groupby('yearalter2').count().reset_index()
grouped_rel_df = grouped_rel_df[grouped_rel_df['yearalter2'] > 1700]
grouped_rel_df['yearalter2'] = grouped_rel_df['yearalter2'].dropna().astype(int)
grouped_rel_df[['yearalter2', 'BIN']].sort_values('BIN', ascending=False)
yearalter2 BIN
27 2008 333
28 2009 293
25 2006 276
31 2012 267
33 2014 257
34 2015 249
32 2013 249
26 2007 234
35 2016 218
29 2010 216
36 2017 214
30 2011 202
37 2018 143
38 2019 141
39 2020 110
24 2005 28
40 2021 21
20 2001 21
9 1989 20
23 2004 18
15 1996 14
22 2003 13
17 1998 11
19 2000 11
14 1995 10
16 1997 8
41 2022 8
21 2002 7
11 1992 7
18 1999 6
13 1994 5
12 1993 4
8 1988 4
6 1986 3
7 1987 2
4 1983 2
2 1962 1
10 1991 1
5 1985 1
3 1982 1
1 1930 1
grouped_rel_df = merged_df  # note: this aliases merged_df, so the fill below also applies to merged_df
grouped_rel_df['yearalter2']= np.where(grouped_rel_df['yearalter2'].isnull(), grouped_rel_df['yearalter1'], grouped_rel_df['yearalter2'])

grouped_rel_df = grouped_rel_df.groupby('yearalter2').count().reset_index()
grouped_rel_df = grouped_rel_df[grouped_rel_df['yearalter2'] > 1700]
grouped_rel_df['yearalter2'] = grouped_rel_df['yearalter2'].dropna().astype(int)
grouped_rel_df[['yearalter2', 'BIN']].sort_values('BIN', ascending=False)
# fig = px.line(grouped_rel_df, y='BIN', x='yearalter1')
# fig.show()
yearalter2 BIN
31 2008 1541
32 2009 1305
30 2007 1076
38 2015 1014
29 2006 1002
37 2014 999
39 2016 852
35 2012 813
33 2010 808
40 2017 797
36 2013 784
42 2019 637
34 2011 626
41 2018 606
43 2020 391
44 2021 129
45 2022 83
28 2005 48
26 2003 37
24 2001 36
27 2004 35
19 1996 28
13 1989 26
23 2000 23
21 1998 20
25 2002 18
15 1992 15
18 1995 15
17 1994 14
22 1999 12
20 1997 12
12 1988 6
16 1993 5
10 1986 4
11 1987 3
14 1991 2
7 1983 2
1 1930 2
2 1950 1
9 1985 1
8 1984 1
6 1982 1
5 1979 1
4 1970 1
3 1962 1
list(merged_df.columns)
['Unnamed: 0',
 'the_geom',
 'NAME',
 'BIN',
 'HEIGHTROOF',
 'MPLUTO_BBL',
 'bbl',
 'bct2020',
 'bldgclass',
 'landuse',
 'ownertype',
 'bldgarea',
 'numbldgs',
 'unitstotal',
 'yearbuilt',
 'yearalter1',
 'yearalter2',
 'latitude',
 'longitude']
import plotly.express as px
import plotly.graph_objects as go

def min_max_scaling(series):
    return round((series - series.min()) * 100/ (series.max() - series.min()))

# fill missing yearalter2 from yearalter1 *before* grouping; after the groupby, yearalter1 holds row counts rather than years
grouped_rel_df = merged_df.copy()
grouped_rel_df['yearalter2'] = np.where(grouped_rel_df['yearalter2'].isnull(), grouped_rel_df['yearalter1'], grouped_rel_df['yearalter2'])
grouped_rel_df = grouped_rel_df.groupby('yearalter2').count().reset_index()
grouped_rel_df = grouped_rel_df[grouped_rel_df['yearalter2'] > 1700]
grouped_rel_df['yearalter2'] = grouped_rel_df['yearalter2'].astype(int)

grouped_rel_df['BIN'] = min_max_scaling(grouped_rel_df['BIN'])
fig2 = px.line(grouped_rel_df, y='BIN', x='yearalter2')

grouped_rel_df = all_plutos.groupby('yearalter2').count().reset_index()
# grouped_rel_df['yearalter2']= np.where(grouped_rel_df['yearalter2'].isnull(), grouped_rel_df['yearalter1'], grouped_rel_df['yearalter2'])
# grouped_rel_df = grouped_rel_df[grouped_rel_df['yearalter2'] > 1700]
grouped_rel_df['yearalter2'] = grouped_rel_df['yearalter2'].dropna().astype(int)

grouped_rel_df['BIN'] = min_max_scaling(grouped_rel_df['BIN'])
fig1 = px.line(grouped_rel_df, y='BIN', x='yearalter2')

fig2.data[0].line.color = "pink"
fig = go.Figure(data = fig1.data + fig2.data)
fig.show()
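The overlay above relies on color alone to tell the two traces apart; naming the traces and axes makes the comparison easier to read. A sketch built on the same two grouped frames (the trace labels are assumptions about which frame is which):

# same comparison, but with named traces and axis labels
fig = go.Figure()
fig.add_trace(go.Scatter(x=fig1.data[0].x, y=fig1.data[0].y, mode='lines',
                         name='all_plutos (all buildings)', line=dict(color='grey')))
fig.add_trace(go.Scatter(x=fig2.data[0].x, y=fig2.data[0].y, mode='lines',
                         name='merged_df (filtered subset)', line=dict(color='pink')))
fig.update_layout(xaxis_title='year of last alteration',
                  yaxis_title='buildings (min-max scaled)')
fig.show()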
buildings['BIN'].dropna().nunique()
1084646
# 'incidents' is the raw FDNY dispatch export, loaded the same way as firedata further down
incidents = pd.read_csv('Fire_Incident_Dispatch_Data.csv')
newdf = incidents[[    
    
 'INCIDENT_DATETIME',
 'ALARM_BOX_BOROUGH',
 'ALARM_BOX_NUMBER',
 'ALARM_BOX_LOCATION',
 
 'INCIDENT_CLASSIFICATION',
 'INCIDENT_CLASSIFICATION_GROUP',
    
 'VALID_DISPATCH_RSPNS_TIME_INDC',
 'VALID_INCIDENT_RSPNS_TIME_INDC',
 'INCIDENT_RESPONSE_SECONDS_QY'
                    ]]
newdf.shape
(8691513, 9)
# problematic!!

year_start = pd.to_datetime('2013', format='%Y')
newdf['INCIDENT_DATETIME'] = pd.to_datetime(newdf['INCIDENT_DATETIME'])
newdf = newdf[newdf['INCIDENT_DATETIME'] >= year_start]
newdf_y = newdf[newdf['INCIDENT_CLASSIFICATION_GROUP'] == 'Structural Fires']
newdf.shape
INCIDENT_DATETIME INCIDENT_BOROUGH ALARM_BOX_BOROUGH ALARM_BOX_NUMBER ALARM_BOX_LOCATION VALID_DISPATCH_RSPNS_TIME_INDC VALID_INCIDENT_RSPNS_TIME_INDC
4511365 2013-01-01 00:00:20 BRONX BRONX 2147 ST ANN'S AVE & E138 ST N N
4511366 2013-01-01 00:00:37 MANHATTAN MANHATTAN 818 11 AVE & 46 ST N Y
4511367 2013-01-01 00:01:17 QUEENS QUEENS 9656 JAMAICA AVE & 117 ST N Y
4511368 2013-01-01 00:01:49 QUEENS QUEENS 4019 WYCKOFF AVE & DECATUR ST N Y
4511369 2013-01-01 00:02:32 QUEENS QUEENS 7412 BROADWAY & 44 ST N Y
newdf_y = newdf_y[newdf_y['INCIDENT_RESPONSE_SECONDS_QY'] > 0] 
# then delete this col, see if you need the 'valid' cols
newdf_y.shape

# newdf_y.to_csv('fire_incidents.csv')
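Given the ~8.7M rows and the near-full memory reading at the top of the notebook, loading only the needed columns and parsing the timestamp up front keeps this step lighter. A sketch, assuming the same column names as above:

usecols = [
    'INCIDENT_DATETIME', 'ALARM_BOX_BOROUGH', 'ALARM_BOX_NUMBER', 'ALARM_BOX_LOCATION',
    'INCIDENT_CLASSIFICATION', 'INCIDENT_CLASSIFICATION_GROUP',
    'VALID_DISPATCH_RSPNS_TIME_INDC', 'VALID_INCIDENT_RSPNS_TIME_INDC',
    'INCIDENT_RESPONSE_SECONDS_QY',
]
# read only the columns used downstream and parse the timestamp on the way in
newdf = pd.read_csv('Fire_Incident_Dispatch_Data.csv', usecols=usecols,
                    parse_dates=['INCIDENT_DATETIME'])
newdf = newdf[newdf['INCIDENT_DATETIME'] >= '2013-01-01']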
# test = plutos_og[plutos_og['bbl'].isin(buildings_og['MPLUTO_BBL'])]
# the *_og frames are the unfiltered originals of the buildings and PLUTO tables as first loaded
test = rel_buildings_og[buildings_og['MPLUTO_BBL'].isin(plutos_og['bbl'])]
test.shape
(66911, 8)
# rel_buildings_og['MPLUTO_BBL'].nunique()
# plutos_og['bbl'].nunique()
rel_buildings = buildings[buildings['BIN'].isin(relevant_bins)]
rel_buildings.nunique()
Unnamed: 0    67022
the_geom      67022
NAME            857
BIN           67022
CNSTRCT_YR      222
LSTMODDATE      731
LSTSTATYPE       10
HEIGHTROOF    24334
MPLUTO_BBL    62124
dtype: int64
list(plutos.columns)
['Unnamed: 0',
 'bbl',
 'bct2020',
 'firecomp',
 'overlay1',
 'overlay2',
 'ltdheight',
 'bldgclass',
 'landuse',
 'ownertype',
 'ownername',
 'bldgarea',
 'numbldgs',
 'numfloors',
 'unitstotal',
 'proxcode',
 'yearbuilt',
 'yearalter1',
 'yearalter2',
 'edesignum',
 'latitude',
 'longitude']
# plutos['yearbuilt'].mean()
plutos['ownertype'].unique()
array([nan, 'X', 'C', 'O', 'P', 'M'], dtype=object)
import pandas as pd
merged_df = pd.read_csv('cleaned_df.csv')
# relevant_bins = pd.Series(pd.read_csv('../../../tejalwakchoure.github.io/chart-data/bins.csv', header=None)[0])
# relevant_df = merged_df[merged_df['BIN'].isin(relevant_bins)]
# relevant_df.groupby(['landuse']).count()
# relevant_df.groupby(['landuse']).count()[['Unnamed: 0_x']]
relevant_df['landuse'].count()
relevant_df.groupby('yearalter2').count()
Unnamed: 0 the_geom NAME BIN HEIGHTROOF MPLUTO_BBL bbl bct2020 bldgclass landuse ownertype bldgarea numbldgs unitstotal yearbuilt yearalter1 latitude longitude
yearalter2
0.0 61374 61374 734 61374 61263 61374 61374 61349 61374 61295 10449 61374 61374 61372 61374 61374 61349 61349
1930.0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1931.0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
1941.0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
1954.0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2018.0 143 143 4 143 143 143 143 143 143 143 33 143 143 143 143 143 143 143
2019.0 141 141 3 141 141 141 141 140 141 141 18 141 141 141 141 141 140 140
2020.0 110 110 2 110 110 110 110 110 110 110 20 110 110 110 110 110 110 110
2021.0 21 21 0 21 21 21 21 20 21 21 4 21 21 21 21 21 20 20
2022.0 8 8 0 8 8 8 8 8 8 8 1 8 8 8 8 8 8 8

69 rows × 18 columns

import numpy as np
import plotly.express as px
import plotly.graph_objects as go

def min_max_scaling(series):
    return round((series - series.min()) * 100/ (series.max() - series.min()))

# as above, fill yearalter2 from yearalter1 before grouping; after the groupby, yearalter1 holds counts, not years
grouped_rel_df = merged_df.copy()
grouped_rel_df['yearalter2'] = np.where(grouped_rel_df['yearalter2'].isnull(), grouped_rel_df['yearalter1'], grouped_rel_df['yearalter2'])
grouped_rel_df = grouped_rel_df.groupby('yearalter2').count().reset_index()
grouped_rel_df = grouped_rel_df[grouped_rel_df['yearalter2'] > 1700]
grouped_rel_df['yearalter2'] = grouped_rel_df['yearalter2'].astype(int)

# dd = grouped_rel_df[['yearalter2','BIN']]
# datedif_df = grouped_rel_df[['yearalter2','BIN']]
# datedif_df.rename(columns={'BIN': 'BIN_all'}, inplace=True)

grouped_rel_df['BIN'] = min_max_scaling(grouped_rel_df['BIN'])
fig2 = px.line(grouped_rel_df, y='BIN', x='yearalter2')

grouped_rel_df = relevant_df.copy()
grouped_rel_df['yearalter2'] = np.where(grouped_rel_df['yearalter2'].isnull(), grouped_rel_df['yearalter1'], grouped_rel_df['yearalter2'])
grouped_rel_df = grouped_rel_df.groupby('yearalter2').count().reset_index()
grouped_rel_df = grouped_rel_df[grouped_rel_df['yearalter2'] > 1700]
grouped_rel_df['yearalter2'] = grouped_rel_df['yearalter2'].astype(int)

# datedif_df_new = grouped_rel_df[['yearalter2','BIN']]
# datedif_df_new.rename(columns={'BIN': 'BIN_relevant'}, inplace=True)
# datedif_df.merge(datedif_df_new, on='yearalter2', how='left')

# dd['BIN'] = dd['BIN'] - grouped_rel_df['BIN']

grouped_rel_df['BIN'] = min_max_scaling(grouped_rel_df['BIN'])
fig1 = px.line(grouped_rel_df, y='BIN', x='yearalter2')

fig2.data[0].line.color = "pink"
fig = go.Figure(data = fig1.data + fig2.data)


fig.show()
test = relevant_df[relevant_df['yearalter2'] == 2005].groupby('landuse').count().reset_index() #.sort_values('BIN', ascending=False)
test['BIN'] = min_max_scaling(test['BIN'])
fig1 = px.line(test, x='landuse', y='BIN')
fig1.update_traces(marker_color='skyblue')
fig1.data[0].line.color = "skyblue"

test = merged_df[merged_df['yearalter2'] == 2005].groupby('landuse').count().reset_index() #.sort_values('BIN', ascending=False)
test['BIN'] = min_max_scaling(test['BIN'])
fig2 = px.line(test, x='landuse', y='BIN')

fig = go.Figure(data = fig1.data + fig2.data)
fig.show()
merged_df[merged_df['yearalter2'] == 2005].groupby('landuse').count().reset_index().sort_values('BIN', ascending=False)
Unnamed: 0 the_geom NAME BIN HEIGHTROOF MPLUTO_BBL bbl bct2020 bldgclass ownertype bldgarea numbldgs unitstotal yearbuilt yearalter1 yearalter2 latitude longitude
landuse
1.0 384 384 0 384 384 384 384 384 384 0 384 384 384 384 384 384 384 384
4.0 185 185 0 185 185 185 185 185 185 4 185 185 185 185 185 185 185 185
2.0 144 144 0 144 144 144 144 144 144 9 144 144 144 144 144 144 144 144
5.0 127 127 4 127 127 127 127 127 127 5 127 127 127 127 127 127 127 127
3.0 43 43 0 43 43 43 43 43 43 9 43 43 43 43 43 43 43 43
8.0 43 43 0 43 43 43 43 43 43 38 43 43 43 43 43 43 43 43
6.0 19 19 0 19 19 19 19 19 19 5 19 19 19 19 19 19 19 19
7.0 11 11 0 11 11 11 11 11 11 0 11 11 11 11 11 11 11 11
9.0 5 5 0 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
10.0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
merged_df.groupby('landuse').count()
Unnamed: 0 the_geom NAME BIN HEIGHTROOF MPLUTO_BBL bbl bct2020 bldgclass ownertype bldgarea numbldgs unitstotal yearbuilt yearalter1 yearalter2 latitude longitude
landuse
1.0 755325 755325 6 755325 754300 755325 755325 755304 755325 2061 755325 755325 755325 755325 755325 755325 755304 755304
2.0 174846 174846 9 174846 174510 174846 174846 174821 174846 4032 174846 174846 174846 174846 174846 174846 174821 174821
3.0 17095 17095 358 17095 16939 17095 17095 17077 17095 4245 17095 17095 17094 17095 17095 17095 17077 17077
4.0 64206 64206 134 64206 63917 64206 64206 64169 64206 1343 64206 64206 64206 64206 64206 64206 64169 64169
5.0 23654 23654 364 23654 23501 23654 23654 23646 23654 946 23654 23654 23654 23654 23654 23654 23646 23646
6.0 11762 11762 6 11762 11705 11762 11762 11756 11762 509 11762 11762 11762 11762 11762 11762 11756 11756
7.0 7440 7440 209 7440 7198 7440 7440 7436 7440 3424 7440 7440 7440 7440 7440 7440 7436 7436
8.0 16941 16941 445 16941 16866 16941 16941 16931 16941 14129 16941 16941 16941 16941 16941 16941 16931 16931
9.0 2875 2875 365 2875 2754 2875 2875 2875 2875 2543 2875 2875 2875 2875 2875 2875 2875 2875
10.0 4668 4668 7 4668 4607 4668 4668 4661 4668 294 4668 4668 4668 4668 4668 4668 4665 4665
11.0 2913 2913 7 2913 2826 2913 2913 2901 2913 521 2913 2913 2913 2913 2913 2913 2901 2901
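For the building-type graphic, the numeric landuse codes are easier to read once mapped to PLUTO's land-use categories. The labels below follow the PLUTO data dictionary and should be verified against the PLUTO version used here:

# PLUTO land-use categories (per the PLUTO data dictionary; verify against the version in use)
landuse_labels = {
    1: 'One & two family buildings',
    2: 'Multi-family walk-up buildings',
    3: 'Multi-family elevator buildings',
    4: 'Mixed residential & commercial buildings',
    5: 'Commercial & office buildings',
    6: 'Industrial & manufacturing',
    7: 'Transportation & utility',
    8: 'Public facilities & institutions',
    9: 'Open space & outdoor recreation',
    10: 'Parking facilities',
    11: 'Vacant land',
}
by_type = merged_df.groupby('landuse')['BIN'].count().rename('buildings').reset_index()
by_type['landuse_label'] = by_type['landuse'].astype(int).map(landuse_labels)
by_type.sort_values('buildings', ascending=False)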
# df and reldf alias merged_df and relevant_df, so the scaling below is applied in place
df = merged_df
reldf = relevant_df
# note: this redefinition drops the rounding and the *100 factor used earlier; it scales to [0, 1]
def min_max_scaling(series):
    return (series - series.min()) / (series.max() - series.min())
for col in ('HEIGHTROOF', 'unitstotal'):
    df[col] = min_max_scaling(df[col])
for col in ('HEIGHTROOF', 'unitstotal'):
    reldf[col] = min_max_scaling(reldf[col])
# merged_df.groupby(['ownertype']).count()
import numpy as np

print(merged_df['unitstotal'].mean()*1000, relevant_df['unitstotal'].mean()*1000)
print(merged_df['HEIGHTROOF'].mean()*1000, relevant_df['HEIGHTROOF'].mean()*1000)

# merged_df['norm_heights'].sort_values().dropna()  # norm_heights is only created in the commented-out block below
# merged_df = merged_df.assign(norm_heights= lambda x: merged_df['HEIGHTROOF'].dropna().astype(int)/merged_df['unitstotal'].dropna().astype(int))
# relevant_df = relevant_df.assign(norm_heights= lambda x: relevant_df['HEIGHTROOF'].dropna().astype(int)/relevant_df['unitstotal'].dropna().astype(int))

# merged_df['norm_heights'].replace([np.inf, - np.inf], np.nan, inplace = True)
# merged_df = merged_df.dropna()
# relevant_df['norm_heights'].replace([np.inf, - np.inf], np.nan, inplace = True)
# relevant_df = relevant_df.dropna()

# print(merged_df['norm_heights'].mean(), relevant_df['norm_heights'].mean())

# merged_df['unitstotal'].median()
# relevant_df['unitstotal'].median()
0.9017254465668113 7.018855585114436
17.824540447631517 42.13211848663473
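Because df and reldf alias the original frames, the means printed above are on min-max-scaled values (hence the *1000). Keeping unscaled copies gives numbers in real units; a sketch of the pattern, assuming the copies are taken before the scaling loop runs:

# keep raw copies before scaling so summary stats stay in real units (dwelling units, feet)
raw_all = merged_df[['unitstotal', 'HEIGHTROOF']].copy()
raw_rel = relevant_df[['unitstotal', 'HEIGHTROOF']].copy()

print('median units per building:', raw_all['unitstotal'].median(), 'vs', raw_rel['unitstotal'].median())
print('median roof height:', raw_all['HEIGHTROOF'].median(), 'vs', raw_rel['HEIGHTROOF'].median())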
import plotly.express as px
import plotly.graph_objects as go



label = 'yearbuilt'
grouped_rel_df = relevant_df
# grouped_rel_df['yearalter2']= np.where(grouped_rel_df['yearalter2'].isnull(), grouped_rel_df['yearalter1'], grouped_rel_df['yearalter2'])
grouped_rel_df = grouped_rel_df[grouped_rel_df[label] > 1700].copy()  # .copy() avoids SettingWithCopyWarning
grouped_rel_df[label] = grouped_rel_df[label].astype(int)
grouped_rel_df = grouped_rel_df.groupby(label).count().reset_index()
grouped_rel_df['BIN'] = min_max_scaling(grouped_rel_df['BIN'])
fig1 = px.line(grouped_rel_df, y='BIN', x=label)

grouped_rel_df_2 = merged_df
# grouped_rel_df_2['yearalter2']= np.where(grouped_rel_df_2['yearalter2'].isnull(), grouped_rel_df_2['yearalter1'], grouped_rel_df_2['yearalter2'])
grouped_rel_df_2 = grouped_rel_df_2[grouped_rel_df_2[label] > 1700].copy()
grouped_rel_df_2[label] = grouped_rel_df_2[label].astype(int)
grouped_rel_df_2 = grouped_rel_df_2.groupby(label).count().reset_index()
grouped_rel_df_2['BIN'] = min_max_scaling(grouped_rel_df_2['BIN'])
fig2 = px.line(grouped_rel_df_2, y='BIN', x=label)

fig1.data[0].line.color = "pink"
fig = go.Figure(data = fig1.data + fig2.data)
fig.show()
import plotly.express as px
import plotly.graph_objects as go

grouped_rel_df_2 = merged_df  # aliases merged_df; the cast below also applies to merged_df
grouped_rel_df_2['unitstotal'] = grouped_rel_df_2['unitstotal'].dropna().astype(int)
grouped_rel_df_2 = grouped_rel_df_2.groupby('unitstotal').count().reset_index()

fig = px.line(grouped_rel_df_2, y='BIN', x='unitstotal')
fig.show()
import plotly.express as px
import plotly.graph_objects as go

fig1 = px.line(merged_df, y='BIN', x='unitstotal')
fig2 = px.line(merged_df, y='BIN', x='bldgarea')

# fig2.data[0].line.color = "pink"
# fig = go.Figure(data = fig1.data + fig2.data)
# fig.show()
# test = merged_df.groupby('yearalter2').sort_values().count()
# test
test = merged_df.sort_values('yearalter2')
test = test['yearalter2'].dropna()[test['yearalter2']>0]
test
1036053    1930.0
1081026    1930.0
970083     1931.0
865551     1941.0
1046073    1950.0
            ...  
431134     2022.0
368524     2022.0
33927      2022.0
814476     2022.0
126084     2022.0
Name: yearalter2, Length: 20078, dtype: float64
merged_df['yearalter2']= np.where(merged_df['yearalter2'].isnull(), merged_df['yearalter1'], merged_df['yearalter2'])
merged_df.shape
(1084656, 31)
# cleaned_df = merged_df
merged_df = merged_df.drop(['Unnamed: 0_x',
                           'CNSTRCT_YR',
                            'LSTMODDATE',
                             'LSTSTATYPE',
                           'Unnamed: 0_y',
                            'firecomp',
                           'overlay1',
                             'overlay2',
                             'ltdheight',             
                             'ownername',
                            'numfloors',
                          'proxcode',                      
                          'edesignum'         
                           ], axis=1)
list(merged_df.columns)
['the_geom',
 'NAME',
 'BIN',
 'HEIGHTROOF',
 'MPLUTO_BBL',
 'bbl',
 'bct2020',
 'bldgclass',
 'landuse',
 'ownertype',
 'bldgarea',
 'numbldgs',
 'unitstotal',
 'yearbuilt',
 'yearalter1',
 'yearalter2',
 'latitude',
 'longitude']
firedata = pd.read_csv('Fire_Incident_Dispatch_Data.csv')
fires_reqd = firedata[firedata['INCIDENT_CLASSIFICATION_GROUP']=='Structural Fires']
fires_reqd = fires_reqd.sort_values('INCIDENT_DATETIME', ascending=False)
rel_fires = fires_reqd[['INCIDENT_CLASSIFICATION_GROUP','INCIDENT_DATETIME' ]]
rel_fires.to_csv('cleaned_fires.csv')
# rel_fires = fires_reqd[fires_reqd['INCIDENT_DATETIME']]  # broken as written: a datetime column can't be used as a boolean mask
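The commented line above was probably meant to restrict the fires to the study window. A minimal sketch of that step, assuming the same 2013 cutoff used for the dispatch data earlier:

import pandas as pd

fires_reqd = pd.read_csv('Fire_Incident_Dispatch_Data.csv')
fires_reqd = fires_reqd[fires_reqd['INCIDENT_CLASSIFICATION_GROUP'] == 'Structural Fires']

# parse the timestamp and keep incidents from 2013 onward (assumed cutoff, mirroring year_start above)
fires_reqd['INCIDENT_DATETIME'] = pd.to_datetime(fires_reqd['INCIDENT_DATETIME'])
rel_fires = fires_reqd[fires_reqd['INCIDENT_DATETIME'] >= pd.Timestamp('2013-01-01')]
rel_fires = rel_fires[['INCIDENT_CLASSIFICATION_GROUP', 'INCIDENT_DATETIME']].sort_values('INCIDENT_DATETIME', ascending=False)
rel_fires.to_csv('cleaned_fires.csv')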