import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from word2number import w2n
#Sample data
df = pd.DataFrame({'year':[2010,2015,2020,2025],'count_1':[1,'*',3,9],'count_2':[1,'s',3,2],'count_3':[1e-20,'three',3,'two'],'count_4':['5.8',6,np.nan,5],'count_5':[4.1,4,False,4],'count_6':[5,6,7,'fifty']})
df
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010 | 1 | 1 | 1e-20 | 5.8 | 4.1 | 5 |
1 | 2015 | * | s | three | 6 | 4 | 6 |
2 | 2020 | 3 | 3 | 3 | NaN | False | 7 |
3 | 2025 | 9 | 2 | two | 5 | 4 | fifty |
Using pandas' to_numeric we can force all values to numeric (unparseable entries become NaN).
#pandas to numeric
def to_num(value):
    """Coerce a single value to a number; anything unparseable becomes NaN."""
    coerced = pd.to_numeric(value, errors='coerce')
    return coerced
df_1 = df.applymap(to_num)
df_1
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010 | 1.0 | 1.0 | 1.000000e-20 | 5.8 | 4.1 | 5.0 |
1 | 2015 | NaN | NaN | NaN | 6.0 | 4 | 6.0 |
2 | 2020 | 3.0 | 3.0 | 3.000000e+00 | NaN | False | 7.0 |
3 | 2025 | 9.0 | 2.0 | NaN | 5.0 | 4 | NaN |
Using sklearn's SimpleImputer, you can fill missing values with the mean of each column.
#impute missing values
def impute_num(df):
    """Fill NaNs with each column's mean, returning a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        All columns must already be numeric (coerce first, e.g. with to_num).

    Returns
    -------
    pandas.DataFrame
        Same shape, columns, and row index as *df*, with NaNs replaced by
        the column means (values come back as float64).
    """
    # sklearn estimators accept DataFrames directly — the previous
    # np.array(...) copies were redundant; fit_transform does both steps.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    filled = imputer.fit_transform(df)
    # Preserve the caller's row index too (the old version reset it to 0..n-1).
    return pd.DataFrame(filled, columns=df.columns, index=df.index)
df_2 = impute_num(df_1)
df_2
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010.0 | 1.000000 | 1.0 | 1.000000e-20 | 5.8 | 4.1 | 5.0 |
1 | 2015.0 | 4.333333 | 2.0 | 1.500000e+00 | 6.0 | 4.0 | 6.0 |
2 | 2020.0 | 3.000000 | 3.0 | 3.000000e+00 | 5.6 | 0.0 | 7.0 |
3 | 2025.0 | 9.000000 | 2.0 | 1.500000e+00 | 5.0 | 4.0 | 6.0 |
Turn string words to numbers through the word2number package. (There's also a num2words package for the reverse direction.)
#words to numbers
def s2n(value):
    """Convert number-words ('three', 'fifty') to numeric values.

    Non-string values, and strings that word2number cannot parse
    ('*', 's', ...), are returned unchanged so a later coercion step
    can deal with them.
    """
    if not isinstance(value, str):
        return value
    try:
        return w2n.word_to_num(value)
    except (ValueError, IndexError):
        # w2n raises ValueError for non-number words (and IndexError on
        # some malformed inputs); the old bare `except:` also swallowed
        # unrelated errors such as KeyboardInterrupt — catch narrowly.
        return value
df_3 = df.applymap(s2n)
df_3
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010 | 1 | 1 | 1.000000e-20 | 5.8 | 4.1 | 5 |
1 | 2015 | * | s | 3.000000e+00 | 6 | 4 | 6 |
2 | 2020 | 3 | 3 | 3.000000e+00 | NaN | False | 7 |
3 | 2025 | 9 | 2 | 2.000000e+00 | 5 | 4 | 50 |
Turn bools to NaNs. (If a column is all bools, it is better to translate them to 0s and 1s instead.)
#bool to nan
def bool_to_nan(value):
    """Replace boolean values with NaN; pass every other value through."""
    return np.nan if isinstance(value, bool) else value
df.applymap(bool_to_nan)
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010 | 1 | 1 | 1e-20 | 5.8 | 4.1 | 5 |
1 | 2015 | * | s | three | 6 | 4.0 | 6 |
2 | 2020 | 3 | 3 | 3 | NaN | NaN | 7 |
3 | 2025 | 9 | 2 | two | 5 | 4.0 | fifty |
Chain functions together:
#bool to nan then string to number then all values numeric then impute values
impute_num((df.applymap(bool_to_nan).applymap(s2n).applymap(to_num)))
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010.0 | 1.000000 | 1.0 | 1.000000e-20 | 5.8 | 4.100000 | 5.0 |
1 | 2015.0 | 4.333333 | 2.0 | 3.000000e+00 | 6.0 | 4.000000 | 6.0 |
2 | 2020.0 | 3.000000 | 3.0 | 3.000000e+00 | 5.6 | 4.033333 | 7.0 |
3 | 2025.0 | 9.000000 | 2.0 | 2.000000e+00 | 5.0 | 4.000000 | 50.0 |
Check dtypes are numeric for feeding to sklearn:
#check dtypes
impute_num((df.applymap(bool_to_nan).applymap(s2n).applymap(to_num))).dtypes
year float64
count_1 float64
count_2 float64
count_3 float64
count_4 float64
count_5 float64
count_6 float64
dtype: object
Drop columns with inaccurate values:
#drop columns
impute_num((df.applymap(bool_to_nan).applymap(s2n).applymap(to_num)).dropna(axis=1))
year | count_3 | count_6 | |
---|---|---|---|
0 | 2010.0 | 1.000000e-20 | 5.0 |
1 | 2015.0 | 3.000000e+00 | 6.0 |
2 | 2020.0 | 3.000000e+00 | 7.0 |
3 | 2025.0 | 2.000000e+00 | 50.0 |
Drop rows with inaccurate values:
#drop rows
impute_num((df.applymap(bool_to_nan).applymap(s2n).applymap(to_num)).dropna(axis=0))
year | count_1 | count_2 | count_3 | count_4 | count_5 | count_6 | |
---|---|---|---|---|---|---|---|
0 | 2010.0 | 1.0 | 1.0 | 1.000000e-20 | 5.8 | 4.1 | 5.0 |
1 | 2025.0 | 9.0 | 2.0 | 2.000000e+00 | 5.0 | 4.0 | 50.0 |
Create custom search functions to return positions of funky values.
#value finder
def value_find(df, find):
    """Report every cell in *df* whose value equals *find*.

    Prints '<find> at column: <col> row: <row>' for each match and returns
    the matches as a list of (row_label, column_label) tuples — an empty
    list when the value does not occur. (The old bare `except: pass`
    silently hid any real error and the results could not be reused.)
    """
    positions = []
    # np.where yields parallel (row_idx, col_idx) arrays of match positions.
    for row_pos, col_pos in zip(*np.where(df.values == find)):
        row = df.index[row_pos]
        col = df.columns[col_pos]
        print(f'{find} at column: {col} row: {row}')
        positions.append((row, col))
    return positions
value_find(df, '*')
* at column: count_1 row: 1
Leverage pre-built packages:
ftfy: URL: https://github.com/LuminosoInsight/python-ftfy
Fixes Unicode strings that were decoded incorrectly (e.g. URLs containing %20 instead of a space).
Beautifier: URL: https://github.com/labtocat/beautifier
Splits emails and URLs into their component parts.
Regular expression operations: Docs: https://docs.python.org/3/library/re.html
Great for sorting/editing specific strings, Practice at: https://regexr.com/