= untar_data("kaggle_competitions::bluebook-bulldozer-remix"); path path
Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix')
Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix')
(#5) [Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix/bluebook-bulldozer-remix.zip'),Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix/Data Dictionary.xlsx'),Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix/Train'),Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix/TrainingData'),Path('/mnt/d/rahuketu/programming/AIKING_HOME/data/bluebook-bulldozer-remix/Valid')]
# Load the training split. low_memory=False makes pandas parse the file in one
# pass, so column dtypes are inferred from the full column rather than per chunk.
df_train = pd.read_csv(path/"Train/Train.csv", low_memory=False, parse_dates=['saledate'], infer_datetime_format=True); df_train.head()
# Load the validation split with the same parser options as the training split,
# so both frames get their dtypes inferred the same way.
df_test = pd.read_csv(path/"Valid/Valid.csv", low_memory=False, parse_dates=['saledate'], infer_datetime_format=True); df_test.head()
SalesID | MachineID | ModelID | datasource | auctioneerID | YearMade | MachineHoursCurrentMeter | UsageBand | saledate | fiModelDesc | ... | Undercarriage_Pad_Width | Stick_Length | Thumb | Pattern_Changer | Grouser_Type | Backhoe_Mounting | Blade_Type | Travel_Controls | Differential_Type | Steering_Controls | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1222837 | 902859 | 1376 | 121 | 3 | 1000 | 0.0 | NaN | 2012-01-05 | 375L | ... | None or Unspecified | None or Unspecified | None or Unspecified | None or Unspecified | Double | NaN | NaN | NaN | NaN | NaN |
1 | 1222839 | 1048320 | 36526 | 121 | 3 | 2006 | 4412.0 | Medium | 2012-01-05 | TX300LC2 | ... | None or Unspecified | 12' 4" | None or Unspecified | Yes | Double | NaN | NaN | NaN | NaN | NaN |
2 | 1222841 | 999308 | 4587 | 121 | 3 | 2000 | 10127.0 | Medium | 2012-01-05 | 270LC | ... | None or Unspecified | 12' 4" | None or Unspecified | None or Unspecified | Double | NaN | NaN | NaN | NaN | NaN |
3 | 1222843 | 1062425 | 1954 | 121 | 3 | 1000 | 4682.0 | Low | 2012-01-05 | 892DLC | ... | None or Unspecified | None or Unspecified | None or Unspecified | None or Unspecified | Double | NaN | NaN | NaN | NaN | NaN |
4 | 1222845 | 1032841 | 4701 | 121 | 3 | 2002 | 8150.0 | Medium | 2012-01-04 | 544H | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Standard | Conventional |
5 rows × 52 columns
count | unique | top | freq | first | last | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SalesID | 401125.0 | NaN | NaN | NaN | NaT | NaT | 1919712.521503 | 909021.492667 | 1139246.0 | 1418371.0 | 1639422.0 | 2242707.0 | 6333342.0 |
SalePrice | 401125.0 | NaN | NaN | NaN | NaT | NaT | 31099.712848 | 23036.898502 | 4750.0 | 14500.0 | 24000.0 | 40000.0 | 142000.0 |
MachineID | 401125.0 | NaN | NaN | NaN | NaT | NaT | 1217902.517971 | 440991.954249 | 0.0 | 1088697.0 | 1279490.0 | 1468067.0 | 2486330.0 |
ModelID | 401125.0 | NaN | NaN | NaN | NaT | NaT | 6889.70298 | 6221.777842 | 28.0 | 3259.0 | 4604.0 | 8724.0 | 37198.0 |
datasource | 401125.0 | NaN | NaN | NaN | NaT | NaT | 134.66581 | 8.962237 | 121.0 | 132.0 | 132.0 | 136.0 | 172.0 |
auctioneerID | 380989.0 | NaN | NaN | NaN | NaT | NaT | 6.55604 | 16.976779 | 0.0 | 1.0 | 2.0 | 4.0 | 99.0 |
YearMade | 401125.0 | NaN | NaN | NaN | NaT | NaT | 1899.156901 | 291.797469 | 1000.0 | 1985.0 | 1995.0 | 2000.0 | 2013.0 |
MachineHoursCurrentMeter | 142765.0 | NaN | NaN | NaN | NaT | NaT | 3457.955353 | 27590.256413 | 0.0 | 0.0 | 0.0 | 3025.0 | 2483300.0 |
UsageBand | 69639 | 3 | Medium | 33985 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
saledate | 401125 | 3919 | 2009-02-16 00:00:00 | 1932 | 1989-01-17 | 2011-12-30 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
fiModelDesc | 401125 | 4999 | 310G | 5039 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
fiBaseModel | 401125 | 1950 | 580 | 19798 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
fiSecondaryDesc | 263934 | 175 | C | 43235 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
fiModelSeries | 56908 | 122 | II | 13202 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
fiModelDescriptor | 71919 | 139 | L | 15875 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ProductSize | 190350 | 6 | Medium | 62274 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
fiProductClassDesc | 401125 | 74 | Backhoe Loader - 14.0 to 15.0 Ft Standard Digging Depth | 56166 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
state | 401125 | 53 | Florida | 63944 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ProductGroup | 401125 | 6 | TEX | 101167 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
ProductGroupDesc | 401125 | 6 | Track Excavators | 101167 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Drive_System | 104361 | 4 | Two Wheel Drive | 46139 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Enclosure | 400800 | 6 | OROPS | 173932 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Forks | 192077 | 2 | None or Unspecified | 178300 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Pad_Type | 79134 | 4 | None or Unspecified | 70614 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Ride_Control | 148606 | 3 | No | 77685 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Stick | 79134 | 2 | Standard | 48829 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Transmission | 183230 | 8 | Standard | 140328 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Turbocharged | 79134 | 2 | None or Unspecified | 75211 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Blade_Extension | 25219 | 2 | None or Unspecified | 24692 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Blade_Width | 25219 | 6 | 14' | 9615 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Enclosure_Type | 25219 | 3 | None or Unspecified | 21923 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Engine_Horsepower | 25219 | 2 | No | 23937 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Hydraulics | 320570 | 12 | 2 Valve | 141404 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Pushblock | 25219 | 2 | None or Unspecified | 19463 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Ripper | 104137 | 4 | None or Unspecified | 83452 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Scarifier | 25230 | 2 | None or Unspecified | 12719 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Tip_Control | 25219 | 3 | None or Unspecified | 16207 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Tire_Size | 94718 | 17 | None or Unspecified | 46339 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Coupler | 213952 | 3 | None or Unspecified | 184582 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Coupler_System | 43458 | 2 | None or Unspecified | 40430 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Grouser_Tracks | 43362 | 2 | None or Unspecified | 40515 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Hydraulics_Flow | 43362 | 3 | Standard | 42784 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Track_Type | 99153 | 2 | Steel | 84880 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Undercarriage_Pad_Width | 99872 | 19 | None or Unspecified | 79651 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Stick_Length | 99218 | 29 | None or Unspecified | 78820 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Thumb | 99288 | 3 | None or Unspecified | 83093 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Pattern_Changer | 99218 | 3 | None or Unspecified | 90255 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Grouser_Type | 99153 | 3 | Double | 84653 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Backhoe_Mounting | 78672 | 2 | None or Unspecified | 78652 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Blade_Type | 79833 | 10 | PAT | 38612 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Travel_Controls | 79834 | 7 | None or Unspecified | 69923 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Differential_Type | 69411 | 4 | Standard | 68073 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Steering_Controls | 69369 | 5 | Conventional | 68679 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Utility for generating multiple features from a single dataframe column
ColExpanderTransform (names, func, func_kw_args)
Takes a function and its arguments and generates multiple columns from a single dataframe column. The transform uses the user-supplied names for the generated columns.
Utilities for generating features from date columns.
get_dateparts (X, attrs=[], keep=False)
get_dateparts_transformer (time=False, keep=False)
original | year | month | week | day | dayofweek | dayofyear | is_month_end | is_month_start | is_quarter_end | is_quarter_start | is_year_end | is_year_start | elapsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2006-11-16 | 2006 | 11 | 46 | 16 | 3 | 320 | False | False | False | False | False | False | 1163635200 |
1 | 2004-03-26 | 2004 | 3 | 13 | 26 | 4 | 86 | False | False | False | False | False | False | 1080259200 |
2 | 2004-02-26 | 2004 | 2 | 9 | 26 | 3 | 57 | False | False | False | False | False | False | 1077753600 |
3 | 2011-05-19 | 2011 | 5 | 20 | 19 | 3 | 139 | False | False | False | False | False | False | 1305763200 |
4 | 2009-07-23 | 2009 | 7 | 30 | 23 | 3 | 204 | False | False | False | False | False | False | 1248307200 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
401120 | 2011-11-02 | 2011 | 11 | 44 | 2 | 2 | 306 | False | False | False | False | False | False | 1320192000 |
401121 | 2011-11-02 | 2011 | 11 | 44 | 2 | 2 | 306 | False | False | False | False | False | False | 1320192000 |
401122 | 2011-11-02 | 2011 | 11 | 44 | 2 | 2 | 306 | False | False | False | False | False | False | 1320192000 |
401123 | 2011-10-25 | 2011 | 10 | 43 | 25 | 1 | 298 | False | False | False | False | False | False | 1319500800 |
401124 | 2011-10-25 | 2011 | 10 | 43 | 25 | 1 | 298 | False | False | False | False | False | False | 1319500800 |
401125 rows × 14 columns
Utility for converting Categorical Column to Numerical Column
NumericalEncoder (categories=None)
Takes an object column that can be categorized and converts it to numerical codes.
Type | Default | Details | |
---|---|---|---|
categories | NoneType | None | Providing categories in init helps in presetting category order |
get_cat_dict (df, max_n_cat=None)
display_cat (cat_dict)
0 | 1 | |
---|---|---|
0 | UsageBand | [High, Low, Medium] |
1 | fiModelDesc | [100C, 104, 1066, 1066E, 1080, 1080B, 1088, 1088CK, 1088LT, 1088TTL, 10B, 10C, 10DG, 110, 1105, 110S, 110TLB, 110Z, 110Z-2, 112, 112E, 112F, 115, 1150, 1150B, 1150BLGP, 1150C, 1150D, 1150E, 1150ELT, 1150G, 1150GLGP, 1150GLT, 1150H, 1150HLGP, 1150HLT, 1150HWT, 1150K, 1150KLGPSERIES3, 1150KWT, 1150KXLT, 1150KXLTIII, 115SRDZ, 115Z, 115ZIII, 115ZIV, 115ZIV-2, 115ZV, 116, 1166, 118, 1187C, 1188, 1188LC, 1188P, 118B, 118C, 11B, 11C, 12, 120, 120B, 120C, 120CLC, 120D, 120E, 120G, 120H, 120HNA, 120LC, 120M , 125, 125A, 125B, 125C, 125CKBNA, 1280, 1280B, 1288, 12E, 12F, 12G, 12H, 12HNA, 12JX, 130, 1300, 1300D, 130G, 130LC, 130LX, 1340XL, 135, 135C, 135CRTS, 135H, 135HNA, 135MSR SPIN ACE, 135SR, 135SRLC, ...] |
2 | fiBaseModel | [10, 100, 104, 1066, 1080, 1088, 10DG, 11, 110, 1105, 112, 115, 1150, 116, 1166, 118, 1187, 1188, 12, 120, 125, 1280, 1288, 130, 1300, 1340, 135, 137, 14, 140, 1400, 143, 1450, 15, 150, 1500, 153, 155, 1550, 16, 160, 1600, 163, 165, 1650, 166, 17, 170, 1700, 1737, 1740, 175, 1750, 1760XL, 17ZTS, 18, 1800, 1818, 1825, 1830, 1835, 1838, 1840, 1845, 185, 1850, 190, 1900, 198, 20, 200, 2000, 2022, 2026, 2040, 2042, 2044, 205, 2050, 2054, 2060, 2060XL, 2064, 2066, 2070, 2074, 2076, 208, 2086, 2095, 2099, 21, 210, 2105, 2109, 211, 212, 213, 213LC, 214, ...] |
3 | fiSecondaryDesc | [ MSR SPIN ACE, #NAME?, -2, -3, -5, -5L, -6, -7, 0.7, 1, 2, 3, 5, 7, A, AA, AB, AG, AW, AX, B, B , BEC, BL, BLGP, BLGPPS, BZ, BZD, C, C , CE, CH, CK, CKB, CL, CLR, CM, CR, CS, CX, D, DC, DL, DT, DX, DXT, DZ, E, EG, EL, ESS, EST, EW, EX, F, FR, G, GT, H, H , H90, HAG, HD, HDS, HDSL, HF, HL, HLGP, HLS, HX, HZ, IV, J, JX, K, L, LC, LC7A, LC7LR, LCD, LCH, LCLR, LCM, LD, LE, LGP, LR, LS, LT, LX, M, M , MC, MR, MRX, MSR, MSR SPIN ACE, MT, MU, MXT, ...] |
4 | fiModelSeries | [ III, #NAME?, -1, -1.50E+01, -11, -12, -15, -16, -17, -18, -1B, -1C, -1L, -2, -20, -21, -21A, -2A, -2C, -2E, -2LC, -2N, -3, -3C, -3EO, -3H, -3L, -3LC, -3LK, -3M, -3MC, -3PT, -4, -5, -5A, -5E, -5F, -5H, -5L, -5LC, -6, -6A, -6B, -6C, -6E, -6K, -6LC, -6LK, -7, -7B, -7E, -7K, -8, -8E, 1, 12, 14FT, 15, 16, 17, 18, 1970, 2, 20, 21KomStat, 21KomStatII, 22, 2B, 2T, 3, 3A, 3C, 4, 5, 5N, 6, 6.00E+00, 6F, 6L, 6LE, 6LK, 7, 7.00E+00, 7A, 7L, 8, A, AWS, D, D7, E, EX, Elite, FASTRRACK, G, GALEO, H, II, III, IV, ...] |
5 | fiModelDescriptor | [ 14FT, LGP, SUPER, XLT, XT, ZX, (BLADE RUNNER), 1, 2, 2.00E+00, 2N, 3, 3.00E+00, 3C, 3L, 3NO, 4WD, 4x4x4, 5, 6, 6K, 7, 7.00E+00, 7A, 8, A, AE0, AVANCE, B, BE, C, CK, CR, CRSB, CUSTOM, DA, DELUXE, DHP, DINGO, DLL, DT, DW, E, ESL, G, GALEO, H, H5, HD, HF, HSD, HT, High Lift, HighLift, II, III, IT, IV, K, K3, K5, KA, KBNA, L, LC, LC8, LCH, LCR, LCRTS, LE, LGP, LGPVP, LITRONIC, LK, LL, LM, LN, LR, LRC, LRR, LS, LT, LU, LX, LongReach, M, MC, ME, MH, N, NLC, NSUC, P, PLUS, PRO, RR, RTS, S, SA, SB, ...] |
6 | ProductSize | [Compact, Large, Large / Medium, Medium, Mini, Small] |
7 | fiProductClassDesc | [Backhoe Loader - 0.0 to 14.0 Ft Standard Digging Depth, Backhoe Loader - 14.0 to 15.0 Ft Standard Digging Depth, Backhoe Loader - 15.0 to 16.0 Ft Standard Digging Depth, Backhoe Loader - 16.0 + Ft Standard Digging Depth, Backhoe Loader - Unidentified, Hydraulic Excavator, Track - 0.0 to 2.0 Metric Tons, Hydraulic Excavator, Track - 11.0 to 12.0 Metric Tons, Hydraulic Excavator, Track - 12.0 to 14.0 Metric Tons, Hydraulic Excavator, Track - 14.0 to 16.0 Metric Tons, Hydraulic Excavator, Track - 150.0 to 300.0 Metric Tons, Hydraulic Excavator, Track - 16.0 to 19.0 Metric Tons, Hydraulic Excavator, Track - 19.0 to 21.0 Metric Tons, Hydraulic Excavator, Track - 2.0 to 3.0 Metric Tons, Hydraulic Excavator, Track - 21.0 to 24.0 Metric Tons, Hydraulic Excavator, Track - 24.0 to 28.0 Metric Tons, Hydraulic Excavator, Track - 28.0 to 33.0 Metric Tons, Hydraulic Excavator, Track - 3.0 to 4.0 Metric Tons, Hydraulic Excavator, Track - 300.0 + Metric Tons, Hydraulic Excavator, Track - 33.0 to 40.0 Metric Tons, Hydraulic Excavator, Track - 4.0 to 5.0 Metric Tons, Hydraulic Excavator, Track - 4.0 to 6.0 Metric Tons, Hydraulic Excavator, Track - 40.0 to 50.0 Metric Tons, Hydraulic Excavator, Track - 5.0 to 6.0 Metric Tons, Hydraulic Excavator, Track - 50.0 to 66.0 Metric Tons, Hydraulic Excavator, Track - 6.0 to 8.0 Metric Tons, Hydraulic Excavator, Track - 66.0 to 90.0 Metric Tons, Hydraulic Excavator, Track - 8.0 to 11.0 Metric Tons, Hydraulic Excavator, Track - 90.0 to 150.0 Metric Tons, Hydraulic Excavator, Track - Unidentified, Hydraulic Excavator, Track - Unidentified (Compact Construction), Motorgrader - 130.0 to 145.0 Horsepower, Motorgrader - 145.0 to 170.0 Horsepower, Motorgrader - 170.0 to 200.0 Horsepower, Motorgrader - 200.0 + Horsepower, Motorgrader - 45.0 to 130.0 Horsepower, Motorgrader - Unidentified, Skid Steer Loader - 0.0 to 701.0 Lb Operating Capacity, Skid Steer Loader - 1251.0 to 1351.0 Lb Operating Capacity, Skid Steer Loader - 
1351.0 to 1601.0 Lb Operating Capacity, Skid Steer Loader - 1601.0 to 1751.0 Lb Operating Capacity, Skid Steer Loader - 1751.0 to 2201.0 Lb Operating Capacity, Skid Steer Loader - 2201.0 to 2701.0 Lb Operating Capacity, Skid Steer Loader - 2701.0+ Lb Operating Capacity, Skid Steer Loader - 701.0 to 976.0 Lb Operating Capacity, Skid Steer Loader - 976.0 to 1251.0 Lb Operating Capacity, Skid Steer Loader - Unidentified, Track Type Tractor, Dozer - 105.0 to 130.0 Horsepower, Track Type Tractor, Dozer - 130.0 to 160.0 Horsepower, Track Type Tractor, Dozer - 160.0 to 190.0 Horsepower, Track Type Tractor, Dozer - 190.0 to 260.0 Horsepower, Track Type Tractor, Dozer - 20.0 to 75.0 Horsepower, Track Type Tractor, Dozer - 260.0 + Horsepower, Track Type Tractor, Dozer - 75.0 to 85.0 Horsepower, Track Type Tractor, Dozer - 85.0 to 105.0 Horsepower, Track Type Tractor, Dozer - Unidentified, Wheel Loader - 0.0 to 40.0 Horsepower, Wheel Loader - 100.0 to 110.0 Horsepower, Wheel Loader - 1000.0 + Horsepower, Wheel Loader - 110.0 to 120.0 Horsepower, Wheel Loader - 120.0 to 135.0 Horsepower, Wheel Loader - 135.0 to 150.0 Horsepower, Wheel Loader - 150.0 to 175.0 Horsepower, Wheel Loader - 175.0 to 200.0 Horsepower, Wheel Loader - 200.0 to 225.0 Horsepower, Wheel Loader - 225.0 to 250.0 Horsepower, Wheel Loader - 250.0 to 275.0 Horsepower, Wheel Loader - 275.0 to 350.0 Horsepower, Wheel Loader - 350.0 to 500.0 Horsepower, Wheel Loader - 40.0 to 60.0 Horsepower, Wheel Loader - 500.0 to 1000.0 Horsepower, Wheel Loader - 60.0 to 80.0 Horsepower, Wheel Loader - 80.0 to 90.0 Horsepower, Wheel Loader - 90.0 to 100.0 Horsepower, Wheel Loader - Unidentified] |
8 | state | [Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut, Delaware, Florida, Georgia, Hawaii, Idaho, Illinois, Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts, Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada, New Hampshire, New Jersey, New Mexico, New York, North Carolina, North Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Puerto Rico, Rhode Island, South Carolina, South Dakota, Tennessee, Texas, Unspecified, Utah, Vermont, Virginia, Washington, Washington DC, West Virginia, Wisconsin, Wyoming] |
9 | ProductGroup | [BL, MG, SSL, TEX, TTT, WL] |
10 | ProductGroupDesc | [Backhoe Loaders, Motor Graders, Skid Steer Loaders, Track Excavators, Track Type Tractors, Wheel Loader] |
11 | Drive_System | [All Wheel Drive, Four Wheel Drive, No, Two Wheel Drive] |
12 | Enclosure | [EROPS, EROPS AC, EROPS w AC, NO ROPS, None or Unspecified, OROPS] |
13 | Forks | [None or Unspecified, Yes] |
14 | Pad_Type | [Grouser, None or Unspecified, Reversible, Street] |
15 | Ride_Control | [No, None or Unspecified, Yes] |
16 | Stick | [Extended, Standard] |
17 | Transmission | [AutoShift, Autoshift, Direct Drive, Hydrostatic, None or Unspecified, Powershift, Powershuttle, Standard] |
18 | Turbocharged | [None or Unspecified, Yes] |
19 | Blade_Extension | [None or Unspecified, Yes] |
20 | Blade_Width | [12', 13', 14', 16', <12', None or Unspecified] |
21 | Enclosure_Type | [High Profile, Low Profile, None or Unspecified] |
22 | Engine_Horsepower | [No, Variable] |
23 | Hydraulics | [2 Valve, 3 Valve, 4 Valve, Auxiliary, Base + 1 Function, Base + 2 Function, Base + 3 Function, Base + 4 Function, Base + 5 Function, Base + 6 Function, None or Unspecified, Standard] |
24 | Pushblock | [None or Unspecified, Yes] |
25 | Ripper | [Multi Shank, None or Unspecified, Single Shank, Yes] |
26 | Scarifier | [None or Unspecified, Yes] |
27 | Tip_Control | [None or Unspecified, Sideshift & Tip, Tip] |
28 | Tire_Size | [10 inch, 10", 13", 14", 15.5, 15.5", 17.5, 17.5", 20.5, 20.5", 23.1", 23.5, 23.5", 26.5, 29.5, 7.0", None or Unspecified] |
29 | Coupler | [Hydraulic, Manual, None or Unspecified] |
30 | Coupler_System | [None or Unspecified, Yes] |
31 | Grouser_Tracks | [None or Unspecified, Yes] |
32 | Hydraulics_Flow | [High Flow, None or Unspecified, Standard] |
33 | Track_Type | [Rubber, Steel] |
34 | Undercarriage_Pad_Width | [14 inch, 15 inch, 16 inch, 18 inch, 20 inch, 22 inch, 24 inch, 25 inch, 26 inch, 27 inch, 28 inch, 30 inch, 31 inch, 31.5 inch, 32 inch, 33 inch, 34 inch, 36 inch, None or Unspecified] |
35 | Stick_Length | [10' 10", 10' 2", 10' 6", 11' 0", 11' 10", 12' 10", 12' 4", 12' 8", 13' 10", 13' 7", 13' 9", 14' 1", 15' 4", 15' 9", 19' 8", 24' 3", 6' 3", 7' 10", 8' 10", 8' 2", 8' 4", 8' 6", 9' 10", 9' 2", 9' 5", 9' 6", 9' 7", 9' 8", None or Unspecified] |
36 | Thumb | [Hydraulic, Manual, None or Unspecified] |
37 | Pattern_Changer | [No, None or Unspecified, Yes] |
38 | Grouser_Type | [Double, Single, Triple] |
39 | Backhoe_Mounting | [None or Unspecified, Yes] |
40 | Blade_Type | [Angle, Coal, Landfill, No, None or Unspecified, PAT, Semi U, Straight, U, VPAT] |
41 | Travel_Controls | [1 Speed, 2 Pedal, Differential Steer, Finger Tip, Lever, None or Unspecified, Pedal] |
42 | Differential_Type | [Limited Slip, Locking, No Spin, Standard] |
43 | Steering_Controls | [Command Control, Conventional, Four Wheel Standard, No, Wheel] |
get_num_dummy_cols (df, cols_cat, max_n_cat=None)
Determines which columns need to be converted to numerical or dummies.
EmptyDF ()
Transformer generating an empty dataframe
0 |
1 |
2 |
3 |
4 |
... |
401120 |
401121 |
401122 |
401123 |
401124 |
401125 rows × 0 columns
mapper_or_empty (feature_def, input_df=True, df_out=True)
gen_feature_layer (X, feature_specs={})
gen_cat_num_features (cols_cat_num, cat_dict, cat_num_dict, scale_dict, scale_var_cat)
get_default_feature_def (X, skip_flds=None, ignored_flds=None, max_n_cat=None, na_exclude_cols=[], scale_var_num=True, scale_var_cat=False, scale_dict={'class': <class 'sklearn.preprocessing._data.StandardScaler'>}, cat_num_dict={'class': <class '__main__.NumericalEncoder'>, 'categories': None}, cat_dummy_dict={'class': <class 'sklearn.preprocessing._encoders.OneHotEncoder'>, 'handle_unknown': 'ignore'}, imputer_dict={'class': <class 'sklearn.impute._base.SimpleImputer'>, 'strategy': 'median'}, include_time_cols=True, keep_dt_cols=False, cat_dict=None)
Type | Default | Details | |
---|---|---|---|
X | Input Dataframe | ||
skip_flds | NoneType | None | Skip columns for transformation. Output dataframe will not contain these columns |
ignored_flds | NoneType | None | Ignore columns for transformation. Output dataframe will contain these columns |
max_n_cat | NoneType | None | Count above which all the columns are converted to numerical column |
na_exclude_cols | list | [] | Cols for which nan column is not added |
scale_var_num | bool | True | Flag to scale numerical column |
scale_var_cat | bool | False | Flag to scale numerical column generated by categorical column |
scale_dict | dict | {‘class’: <class ‘sklearn.preprocessing._data.StandardScaler’>} | Transformer/ Encoder for Scaling |
cat_num_dict | dict | {‘class’: <class ‘__main__.NumericalEncoder’>, ‘categories’: None} | Transformer/ Encoder for Numericalizing categorical column |
cat_dummy_dict | dict | {‘class’: <class ‘sklearn.preprocessing._encoders.OneHotEncoder’>, ‘handle_unknown’: ‘ignore’} | Transformer/ Encoder for Generating column dummies |
imputer_dict | dict | {‘class’: <class ‘sklearn.impute._base.SimpleImputer’>, ‘strategy’: ‘median’} | Transformer/ Encoder for Imputing Columns |
include_time_cols | bool | True | Flag to indicate time feature evaluation |
keep_dt_cols | bool | False | Flag to indicate keeping datetime column obj/ categories |
cat_dict | NoneType | None | Dictionary with columns:[Array] of preset categories |
# Collect the category lists of the training frame (same max_n_cat as below).
cat_dict = get_cat_dict(df_train, max_n_cat=10)

# Build the default feature definition from every column except the target,
# reusing the preset categories and scaling the numericalized categoricals.
feature_def = get_default_feature_def(
    df_train.drop(columns=['SalePrice']),
    max_n_cat=10,
    cat_dict=cat_dict,
    scale_var_cat=True,
    cat_num_dict={'class': NumericalEncoder, 'categories': None},
)

# Show the first three entries of the generated definition.
feature_def[:3]
[('auctioneerID',
[FunctionTransformer(func=<function isna>)],
{'suffix': '_nan'}),
('MachineHoursCurrentMeter',
[FunctionTransformer(func=<function isna>)],
{'suffix': '_nan'}),
('fiSecondaryDesc',
[FunctionTransformer(func=<function isna>)],
{'suffix': '_nan'})]
# Default layer specification: a (layer function, keyword arguments) pair.
# The keyword arguments mirror the parameters of get_default_feature_def.
layer_spec_default = (
    get_default_feature_def,
    dict(
        skip_flds=None,
        ignored_flds=None,
        max_n_cat=10,
        na_exclude_cols=[],
        scale_var_num=True,
        scale_var_cat=False,
        scale_dict={'class': StandardScaler},
        cat_num_dict={'class': NumericalEncoder, 'categories': None},
        cat_dummy_dict={'class': OneHotEncoder, 'handle_unknown': 'ignore'},
        imputer_dict={'class': SimpleImputer, 'strategy': 'median'},
        include_time_cols=True,
        keep_dt_cols=False,
        cat_dict=None,
    ),
)
Proc (layer_specs=None, layer_spec_default=(<function get_default_feature_def at 0x7fad1d1efe50>, {'skip_flds': None, 'ignored_flds': None, 'max_n_cat': 10, 'na_exclude_cols': [], 'scale_var_num': True, 'scale_var_cat': False, 'scale_dict': {'class': <class 'sklearn.preprocessing._data.StandardScaler'>}, 'cat_num_dict': {'class': <class '__main__.NumericalEncoder'>, 'categories': None}, 'cat_dummy_dict': {'class': <class 'sklearn.preprocessing._encoders.OneHotEncoder'>, 'handle_unknown': 'ignore'}, 'imputer_dict': {'class': <class 'sklearn.impute._base.SimpleImputer'>, 'strategy': 'median'}, 'include_time_cols': True, 'keep_dt_cols': False, 'cat_dict': None}))
This transformer generates all the features based on the provided layer specs. It can combine multiple sequential layers. If no layer is provided, it uses the default layer described above.
Type | Default | Details | |
---|---|---|---|
layer_specs | NoneType | None | Array of layers |
layer_spec_default | tuple | (<function get_default_feature_def at 0x7fad1d1efe50>, {‘skip_flds’: None, ‘ignored_flds’: None, ‘max_n_cat’: 10, ‘na_exclude_cols’: [], ‘scale_var_num’: True, ‘scale_var_cat’: False, ‘scale_dict’: {‘class’: <class ‘sklearn.preprocessing._data.StandardScaler’>}, ‘cat_num_dict’: {‘class’: <class ‘main.NumericalEncoder’>, ‘categories’: None}, ‘cat_dummy_dict’: {‘class’: <class ‘sklearn.preprocessing._encoders.OneHotEncoder’>, ‘handle_unknown’: ‘ignore’}, ‘imputer_dict’: {‘class’: <class ‘sklearn.impute._base.SimpleImputer’>, ‘strategy’: ‘median’}, ‘include_time_cols’: True, ‘keep_dt_cols’: False, ‘cat_dict’: None}) | Default layer as described above (layerfunc, kw_args) |
def get_model_pipeline(max_n_cat=0,
                       cat_dict=None,
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class':NumericalEncoder,'categories':None},
                       cat_dummy_dict={'class':OneHotEncoder,'handle_unknown':'ignore'},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},
                       ):
    """Build a preprocessing + random-forest regression pipeline.

    A single default feature layer (``get_default_feature_def``) is wrapped in
    a ``Proc`` transformer and chained with a ``RandomForestRegressor``.

    Parameters
    ----------
    max_n_cat : passed to the feature layer as ``max_n_cat``.
    cat_dict : passed to the feature layer as ``cat_dict`` (preset categories).
    scale_dict : transformer spec used for scaling.
    cat_num_dict : transformer spec used to numericalize categorical columns.
    cat_dummy_dict : transformer spec used to generate dummy columns.
    imputer_dict : transformer spec used for imputing.

    Returns
    -------
    The object returned by ``make_pipeline(proc, model)``; it is not fitted here.
    """
    def _fresh(spec):
        # The spec dicts are default arguments shared across calls; hand a
        # shallow copy downstream so nothing can mutate the shared defaults.
        return dict(spec) if isinstance(spec, dict) else spec

    layer_kwargs = {
        'skip_flds': None,
        'ignored_flds': None,
        'max_n_cat': max_n_cat,
        'na_exclude_cols': [],
        'scale_var_num': True,
        'scale_var_cat': False,
        'scale_dict': _fresh(scale_dict),
        'cat_num_dict': _fresh(cat_num_dict),
        'cat_dummy_dict': _fresh(cat_dummy_dict),
        'imputer_dict': _fresh(imputer_dict),
        'include_time_cols': True,
        'keep_dt_cols': False,
        'cat_dict': cat_dict,
    }
    layer_spec_default = (get_default_feature_def, layer_kwargs)

    proc = Proc(layer_specs=[layer_spec_default])
    model = RandomForestRegressor()
    return make_pipeline(proc, model)
get_scorer_dict (scorer_names=['r2', 'neg_root_mean_squared_error', 'explained_variance', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error'])
get_score (pipeline, X_train, y_train, X_valid, y_valid, scorers={'r2': make_scorer(r2_score)})
Type | Default | Details | |
---|---|---|---|
pipeline | Estimator or pipeline | ||
X_train | Training data | ||
y_train | Training data target | ||
X_valid | Validation data | ||
y_valid | Validation data target | ||
scorers | dict | {‘r2’: make_scorer(r2_score)} | Scoring dict subset of sklearn scorer |
# Evaluate the pipeline on the training and validation data with every scorer
# returned by get_scorer_dict(), then display the resulting table.
score_df = get_score(
    pipeline,
    X_model,
    y_model,
    X_valid,
    y_valid,
    scorers=get_scorer_dict(),
)
score_df
Training | Validation | |
---|---|---|
Metric | ||
r2 | 0.856186 | 0.826902 |
neg_root_mean_squared_error | -0.262653 | -0.301449 |
explained_variance | 0.856189 | 0.830701 |
neg_median_absolute_error | -0.143690 | -0.183203 |
neg_mean_absolute_percentage_error | -0.019180 | -0.022641 |
# Flatten the score table into a single row: unstack to a Series keyed by
# (column, metric), then turn that Series into a one-row frame.
score_df2 = (
    score_df
    .unstack()
    .to_frame()
    .T
)
# Join each two-level column label into one name of the form "<level0>_<level1>".
score_df2.columns = ['{0[0]}_{0[1]}'.format(label) for label in score_df2.columns]
score_df2
Training_r2 | Training_neg_root_mean_squared_error | Training_explained_variance | Training_neg_median_absolute_error | Training_neg_mean_absolute_percentage_error | Validation_r2 | Validation_neg_root_mean_squared_error | Validation_explained_variance | Validation_neg_median_absolute_error | Validation_neg_mean_absolute_percentage_error | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.856186 | -0.262653 | 0.856189 | -0.14369 | -0.01918 | 0.826902 | -0.301449 | 0.830701 | -0.183203 | -0.022641 |
timeseries_cv (df_model, sort_fld, y_fld, pipeline_callback_dict, y_mod_func=None, scorers={'r2': make_scorer(r2_score)}, n_train=15000, n_test=12000, n_splits=10)
Type | Default | Details | |
---|---|---|---|
df_model | Dataset for modelling training + validation | ||
sort_fld | Field for sorting dataframe required for ordering timeseries split | ||
y_fld | Field/colname for yaxis | ||
pipeline_callback_dict | {‘func’: get_model_pipeline, ‘func_kwargs’:{‘max_n_cat’:0, ‘cat_dict’:None}} | ||
y_mod_func | NoneType | None | Function/ transformation applied to target variable |
scorers | dict | {‘r2’: make_scorer(r2_score)} | Scoring dict subset of sklearn scorer |
n_train | int | 15000 | max-number of samples in training |
n_test | int | 12000 | max-number of samples in testing |
n_splits | int | 10 | number of splits |
# Time-ordered cross-validation: rows are sorted by sale date, the target is the
# log of SalePrice, and a new pipeline is built through the callback spec.
df_cv = timeseries_cv(
    df_model,
    sort_fld='saledate',
    y_fld='SalePrice',
    pipeline_callback_dict={
        'func': get_model_pipeline,
        'func_kwargs': {'max_n_cat': 0, 'cat_dict': None},
    },
    y_mod_func=np.log,
    scorers=get_scorer_dict(),
    n_train=15000,
    n_test=12000,
    n_splits=10,
)
df_cv
''
Training | Validation | set | train_start | train_end | valid_start | valid_end | |
---|---|---|---|---|---|---|---|
Metric | |||||||
r2 | 0.986255 | 0.854053 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
neg_root_mean_squared_error | -0.081914 | -0.264194 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
explained_variance | 0.986255 | 0.861748 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
neg_median_absolute_error | -0.037508 | -0.148583 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
neg_mean_absolute_percentage_error | -0.005579 | -0.019604 | 1 | 2008-02-26 | 2008-06-28 | 2008-06-28 | 2008-11-07 |
r2 | 0.985485 | 0.839856 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
neg_root_mean_squared_error | -0.083591 | -0.280460 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
explained_variance | 0.985486 | 0.863647 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
neg_median_absolute_error | -0.040666 | -0.168718 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
neg_mean_absolute_percentage_error | -0.005827 | -0.021360 | 2 | 2008-02-26 | 2008-11-07 | 2008-11-07 | 2009-02-16 |
r2 | 0.985958 | 0.855100 | 3 | 2008-03-04 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
neg_root_mean_squared_error | -0.082704 | -0.268827 | 3 | 2008-03-04 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
explained_variance | 0.985959 | 0.868499 | 3 | 2008-03-04 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
neg_median_absolute_error | -0.039072 | -0.160533 | 3 | 2008-03-04 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
neg_mean_absolute_percentage_error | -0.005711 | -0.020542 | 3 | 2008-03-04 | 2009-02-16 | 2009-02-16 | 2009-05-05 |
r2 | 0.986205 | 0.855375 | 4 | 2008-03-04 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
neg_root_mean_squared_error | -0.083192 | -0.267737 | 4 | 2008-03-04 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
explained_variance | 0.986205 | 0.855806 | 4 | 2008-03-04 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
neg_median_absolute_error | -0.040010 | -0.155045 | 4 | 2008-03-04 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
neg_mean_absolute_percentage_error | -0.005781 | -0.020048 | 4 | 2008-03-04 | 2009-05-05 | 2009-05-05 | 2009-08-19 |
r2 | 0.985169 | 0.860407 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
neg_root_mean_squared_error | -0.086075 | -0.258262 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
explained_variance | 0.985169 | 0.860460 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
neg_median_absolute_error | -0.042436 | -0.145335 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
neg_mean_absolute_percentage_error | -0.006073 | -0.019179 | 5 | 2008-03-05 | 2009-08-19 | 2009-08-19 | 2009-12-04 |
r2 | 0.985437 | 0.844689 | 6 | 2008-03-18 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
neg_root_mean_squared_error | -0.083634 | -0.274846 | 6 | 2008-03-18 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
explained_variance | 0.985438 | 0.853109 | 6 | 2008-03-18 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
neg_median_absolute_error | -0.040839 | -0.159299 | 6 | 2008-03-18 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
neg_mean_absolute_percentage_error | -0.005867 | -0.020023 | 6 | 2008-03-18 | 2009-12-04 | 2009-12-04 | 2010-03-30 |
r2 | 0.984997 | 0.842430 | 7 | 2008-07-16 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
neg_root_mean_squared_error | -0.085599 | -0.279458 | 7 | 2008-07-16 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
explained_variance | 0.984998 | 0.843239 | 7 | 2008-07-16 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
neg_median_absolute_error | -0.039553 | -0.153045 | 7 | 2008-07-16 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
neg_mean_absolute_percentage_error | -0.005894 | -0.020454 | 7 | 2008-07-16 | 2010-03-30 | 2010-03-30 | 2010-08-24 |
r2 | 0.984073 | 0.870446 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
neg_root_mean_squared_error | -0.088414 | -0.254219 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
explained_variance | 0.984075 | 0.870939 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
neg_median_absolute_error | -0.042612 | -0.145637 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
neg_mean_absolute_percentage_error | -0.006137 | -0.018818 | 8 | 2008-11-12 | 2010-08-24 | 2010-08-24 | 2011-01-26 |
r2 | 0.985310 | 0.856385 | 9 | 2008-11-12 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
neg_root_mean_squared_error | -0.085470 | -0.267713 | 9 | 2008-11-12 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
explained_variance | 0.985311 | 0.865565 | 9 | 2008-11-12 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
neg_median_absolute_error | -0.041587 | -0.149833 | 9 | 2008-11-12 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
neg_mean_absolute_percentage_error | -0.005997 | -0.019396 | 9 | 2008-11-12 | 2011-01-26 | 2011-01-26 | 2011-04-28 |
r2 | 0.986415 | 0.855716 | 10 | 2008-11-12 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
neg_root_mean_squared_error | -0.082733 | -0.268315 | 10 | 2008-11-12 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
explained_variance | 0.986419 | 0.855830 | 10 | 2008-11-12 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
neg_median_absolute_error | -0.038141 | -0.150238 | 10 | 2008-11-12 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
neg_mean_absolute_percentage_error | -0.005631 | -0.019745 | 10 | 2008-11-12 | 2011-04-28 | 2011-04-28 | 2011-08-31 |
# Keep only the RMSE rows and the train/validation scores, indexed by the end
# date of each training window.
df_cv.loc[
    'neg_root_mean_squared_error',
    ['train_end', 'Training', 'Validation'],
].set_index('train_end')
Training | Validation | |
---|---|---|
train_end | ||
2008-06-28 | -0.081914 | -0.264194 |
2008-11-07 | -0.083591 | -0.280460 |
2009-02-16 | -0.082704 | -0.268827 |
2009-05-05 | -0.083192 | -0.267737 |
2009-08-19 | -0.086075 | -0.258262 |
2009-12-04 | -0.083634 | -0.274846 |
2010-03-30 | -0.085599 | -0.279458 |
2010-08-24 | -0.088414 | -0.254219 |
2011-01-26 | -0.085470 | -0.267713 |
2011-04-28 | -0.082733 | -0.268315 |
Training | Validation | |
---|---|---|
0 | -0.084332 | -0.268403 |
We can conclude that the model overfits the training set, with the expected error on unseen data being around 0.267758.
<AxesSubplot:xlabel='train_end'>