#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from sklearn.impute import KNNImputer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import f_regression


# ### README

# Este código contiene la metodología desarrollada para resolver el problema del proceso de producción del antígeno propuesto en la datathon UNIVERSITYHACK 2024.

# Autores: Ana Porras Garrido y Pedro José Lucas Guillén - Equipo Named17 

# Es necesario definir la variable FOLDER como la ruta en la que se ubican los archivos del reto. Se utilizan todos los archivos proporcionados en sus últimas versiones.

# En REQUIREMENTS se indican la versión de Python y las versiones de los paquetes utilizados.

# #### REQUIREMENTS

# Python 3.12.5

# * anyio==4.6.0
# * argon2-cffi==23.1.0
# * argon2-cffi-bindings==21.2.0
# * arrow==1.3.0
# * asttokens==2.4.1
# * async-lru==2.0.4
# * attrs==24.2.0
# * babel==2.16.0
# * beautifulsoup4==4.12.3
# * bleach==6.1.0
# * certifi==2024.8.30
# * cffi==1.17.1
# * charset-normalizer==3.3.2
# * colorama==0.4.6
# * comm==0.2.2
# * contourpy==1.3.0
# * cycler==0.12.1
# * debugpy==1.8.5
# * decorator==5.1.1
# * defusedxml==0.7.1
# * et-xmlfile==1.1.0
# * executing==2.1.0
# * fastjsonschema==2.20.0
# * fonttools==4.54.0
# * fqdn==1.5.1
# * h11==0.14.0
# * httpcore==1.0.5
# * httpx==0.27.2
# * idna==3.10
# * ipykernel==6.29.5
# * ipython==8.27.0
# * isoduration==20.11.0
# * jedi==0.19.1
# * Jinja2==3.1.4
# * joblib==1.4.2
# * json5==0.9.25
# * jsonpointer==3.0.0
# * jsonschema==4.23.0
# * jsonschema-specifications==2023.12.1
# * jupyter-events==0.10.0
# * jupyter-lsp==2.2.5
# * jupyter_client==8.6.3
# * jupyter_core==5.7.2
# * jupyter_server==2.14.2
# * jupyter_server_terminals==0.5.3
# * jupyterlab==4.2.5
# * jupyterlab_pygments==0.3.0
# * jupyterlab_server==2.27.3
# * kiwisolver==1.4.7
# * lightgbm==4.5.0
# * MarkupSafe==2.1.5
# * matplotlib==3.9.2
# * matplotlib-inline==0.1.7
# * mistune==3.0.2
# * nbclient==0.10.0
# * nbconvert==7.16.4
# * nbformat==5.10.4
# * nest-asyncio==1.6.0
# * notebook_shim==0.2.4
# * numpy==2.1.1
# * openpyxl==3.1.5
# * overrides==7.7.0
# * packaging==24.1
# * pandas==2.2.3
# * pandocfilters==1.5.1
# * parso==0.8.4
# * patsy==0.5.6
# * pillow==10.4.0
# * platformdirs==4.3.6
# * prometheus_client==0.21.0
# * prompt_toolkit==3.0.47
# * psutil==6.0.0
# * pure_eval==0.2.3
# * pycparser==2.22
# * Pygments==2.18.0
# * pyparsing==3.1.4
# * python-dateutil==2.9.0.post0
# * python-json-logger==2.0.7
# * pytz==2024.2
# * pywin32==306
# * pywinpty==2.0.13
# * PyYAML==6.0.2
# * pyzmq==26.2.0
# * referencing==0.35.1
# * requests==2.32.3
# * rfc3339-validator==0.1.4
# * rfc3986-validator==0.1.1
# * rpds-py==0.20.0
# * scikit-learn==1.5.2
# * scipy==1.14.1
# * seaborn==0.13.2
# * Send2Trash==1.8.3
# * setuptools==75.1.0
# * six==1.16.0
# * sniffio==1.3.1
# * soupsieve==2.6
# * stack-data==0.6.3
# * statsmodels==0.14.4
# * terminado==0.18.1
# * threadpoolctl==3.5.0
# * tinycss2==1.3.0
# * tornado==6.4.1
# * tqdm==4.66.5
# * traitlets==5.14.3
# * types-python-dateutil==2.9.0.20240906
# * tzdata==2024.2
# * uri-template==1.3.0
# * urllib3==2.2.3
# * wcwidth==0.2.13
# * webcolors==24.8.0
# * webencodings==0.5.1
# * websocket-client==1.8.0
# * wheel==0.44.0
# * xgboost==2.1.1

# ### PARAMETERS

# In[2]:


# Root folder of the challenge files; every file name below is relative to it.
FOLDER = '20240927 Reto UniversityHack 2024/'

# Workbook names grouped by data source.
FILES = {
    'of': ['OF 123456 v03.xlsx'],
    'producción': ['Fases producción v03.xlsx'],
    'biorreactor': [
        'Biorreactor 13169.xlsx',
        'Biorreactor 13170.xlsx',
        'Biorreactor 13171.xlsx',
        'Biorreactor 13172.xlsx',
        'Biorreactor 14614.xlsx',
        'Biorreactor 14615.xlsx',
        'Biorreactor 14616.xlsx',
        'Biorreactor 14617.xlsx',
        'Biorreactor 14618.xlsx',
    ],
    'cinéticos': ['Cinéticos IPC.xlsx'],
    'centrífuga': [
        'Centrífuga 12912.xlsx',
        'Centrífuga 14246.xlsx',
        'Centrífuga 17825.xlsx',
    ],
    'centrífuga_horas': ['Horas inicio fin centrífugas.xlsx'],
    'componentes': ['Movimientos componentes.xlsx'],
    'ambientales': ['Temperaturas y humedades.xlsx'],
    'test': ['Fases producción v03 Test.xlsx'],
}

# Numeric equipment ids, taken from the file names ("<kind> <id>.xlsx").
biorreactores_id = [
    int(name.removeprefix('Biorreactor ').removesuffix('.xlsx'))
    for name in FILES['biorreactor']
]
centrifugas_id = [
    int(name.removeprefix('Centrífuga ').removesuffix('.xlsx'))
    for name in FILES['centrífuga']
]


# ### FUNCTIONS

# In[3]:


def preprocess(var, drop_unique=True):
    """Clean a raw sheet and coerce text-encoded numbers to floats.

    Steps, in order: drop duplicated rows; optionally drop columns that hold
    exactly one distinct non-null value; map missing-value tokens to ``None``;
    convert the non-null cells of mixed-type columns to float; re-parse the
    columns that were ``datetime64[ns]`` before the token replacement.

    Parameters
    ----------
    var : pandas.DataFrame
        Raw table. It is not modified; a cleaned copy is returned.
    drop_unique : bool, default True
        Whether to drop the columns with a single distinct value.

    Returns
    -------
    pandas.DataFrame
    """
    var = var.drop_duplicates()
    if drop_unique:
        to_drop = var.columns[var.nunique() == 1]
        var = var.drop(to_drop, axis=1)
    # Remember the datetime columns now: they are re-parsed at the end.
    parse_dates = var.dtypes[var.dtypes == 'datetime64[ns]'].index
    # fix number formats
    var = var.replace({float('nan'): None, 'N.A': None, 'NA': None, 'na': None, '-': None, 'LO': None, 'NR': None})

    def _plain_number(values):
        # "1,5" -> 1.5 and "+3" -> 3.0
        text = values.astype(str).str.replace(',', '.', regex=False)
        return text.str.replace('+', '', regex=False).astype(float)

    def _before_slash(values):
        # "1,5/2" -> 1.5 (keeps only what precedes the first '/')
        text = values.astype(str).str.replace(',', '.', regex=False)
        return text.str.split('/').str[0].astype(float)

    # Two passes over every column: the stricter parser first, then the
    # slash-tolerant one for the columns that are still mixed.
    for convert in (_plain_number, _before_slash):
        for col in var.columns:
            if 'mixed' not in pd.api.types.infer_dtype(var[col]):
                continue
            nona = ~var[col].isna()
            try:
                var.loc[nona, col] = convert(var.loc[nona, col])
            except (ValueError, TypeError):
                # Best effort: a column that cannot be parsed is left untouched.
                pass
    for col in parse_dates:
        var[col] = pd.to_datetime(var[col])
    return var


# In[4]:


def biorreactor_lotes(var, key):
    """Return the rows of `var` whose 'ID Bioreactor' equals `key`, sorted by 'Fecha/hora inicio'."""
    same_bioreactor = var['ID Bioreactor'] == key
    return var.loc[same_bioreactor].sort_values(by='Fecha/hora inicio')

def buscar_lote_previo(var, lote):
    """Return the lot that ran just before `lote` in the same bioreactor.

    The lots of the bioreactor are ordered by start time (see
    `biorreactor_lotes`); the result is NaN when `lote` is the first one.
    """
    same_bioreactor = biorreactor_lotes(var, var.loc[lote, 'ID Bioreactor'])
    # Each lot paired with the label of the lot one position earlier.
    previous = pd.Series(same_bioreactor.index, index=same_bioreactor.index, name='lote_previo').shift(1)
    return previous.loc[lote]

def check_peso(fin_parental, ini_has_parental, bio_id):
    """Summarise the net load-cell weight of a bioreactor between two instants.

    Reads 'Load_Cell_Net_PV' from the module-level `biorreactores[bio_id]`
    strictly between `fin_parental` and `ini_has_parental`, with negative
    readings clipped to 0.

    Returns
    -------
    tuple
        ``(peso_minimo, peso_reduccion)``: the minimum weight in the window and
        its relative change with respect to the highest weight recorded up to
        that minimum (the denominator is floored at 1). With an empty window
        the reduction is 0 and the minimum is whatever ``min()`` yields for an
        empty series.
    """
    readings = biorreactores[bio_id]
    in_window = (readings.index > fin_parental) & (readings.index < ini_has_parental)
    weight = readings[in_window]['Load_Cell_Net_PV'].clip(lower=0)
    peso_minimo = weight.min()
    if len(weight) == 0:
        # No readings in the window: the lot is assumed to be independent.
        return peso_minimo, 0
    peak_before_min = weight.loc[:weight.idxmin()].max()
    return peso_minimo, peso_minimo / max(peak_before_min, 1) - 1

def check_dependencias(var):
    """Build a boolean table, one row per row of `var`, describing each lot's links.

    Columns: 'in_of', 'in_preinoculo' and 'in_inoculo' (the lot appears in the
    index of the module-level `of` / `preinoculo` / `inoculo` tables),
    'is_parental' (the lot is listed as some row's 'LOTE parental') and
    '~is_child' (the row has no 'LOTE parental').
    """
    lots = var['LOTE']
    parents = var['LOTE parental']
    flags = {
        'in_of': lots.isin(of.index),
        'is_parental': lots.isin(parents.dropna().to_list()),
        '~is_child': parents.isna(),
        'in_preinoculo': lots.isin(preinoculo.index),
        'in_inoculo': lots.isin(inoculo.index),
    }
    return pd.concat(list(flags.values()), axis=1, keys=list(flags))

def check_parental_previo(var, aux=None):
    """Compare every lot with its parent lot and with the previous lot in its bioreactor.

    Parameters
    ----------
    var : pandas.DataFrame
        Lots indexed by lot id. NOTE: a 'LOTE previo' column is written into
        `var` in place.
    aux : pandas.DataFrame, optional
        Table in which the previous lot is searched; defaults to a copy of
        `var` taken before 'LOTE previo' is written.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(parental, previo)``. Each holds the lot's own columns, the linked
        lot's columns (suffix '_parental' / '_previo'), the 'lag' between the
        linked lot's end and this lot's start, and the `check_peso` results.
        `parental` carries the weights twice: for the lot's own bioreactor
        (suffix '_bio') and for the parent's one (suffix '_bio_parental').
    """
    if aux is None:
        aux = var.copy()

    cols = ['ID Bioreactor', 'Orden en el encadenado', 'Fecha/hora inicio', 'Fecha/hora fin']
    weight_cols = ['peso_minimo', 'peso_reduccion']

    def _weights(table, end_col, bio_col, link_col):
        # check_peso for every lot whose `link_col` value in `var` is truthy.
        measured = {
            lote: check_peso(table.loc[lote, end_col], table.loc[lote, 'Fecha/hora inicio'], table.loc[lote, bio_col])
            for lote in var.index
            if var.loc[lote, link_col]
        }
        frame = pd.DataFrame.from_dict(measured, orient='index', columns=weight_cols)
        return frame.reindex(var.index)

    # --- parent lot ---
    parental = var[cols + ['LOTE parental']].merge(
        var[cols], how='left', left_on='LOTE parental', right_index=True, suffixes=('', '_parental')
    )
    parental['lag'] = parental['Fecha/hora inicio'] - parental['Fecha/hora fin_parental']
    own_bio = _weights(parental, 'Fecha/hora fin_parental', 'ID Bioreactor', 'LOTE parental')
    parental = parental.join(own_bio)
    parent_bio = _weights(parental, 'Fecha/hora fin_parental', 'ID Bioreactor_parental', 'LOTE parental')
    parental = parental.join(parent_bio, lsuffix='_bio', rsuffix='_bio_parental')

    # --- previous lot in the same bioreactor ---
    var['LOTE previo'] = pd.Series({lote: buscar_lote_previo(aux, lote) for lote in var.index})
    previo = var[cols + ['LOTE previo']].merge(
        aux[cols], how='left', left_on='LOTE previo', right_index=True, suffixes=('', '_previo')
    )
    previo['lag'] = previo['Fecha/hora inicio'] - previo['Fecha/hora fin_previo']
    previo = previo.join(_weights(previo, 'Fecha/hora fin_previo', 'ID Bioreactor', 'LOTE previo'))

    return parental, previo

def check_centrifuga_id(centrifuga_horas, var, centrifuga_cin):
    """Put the centrifuge id reported by three sources side by side, one row per lot of `var`.

    Columns: 'cultivo' (from `var`), 'horas' (from `centrifuga_horas`, rows
    without a lot are ignored) and 'cinetico' (from the 'ID Centrífuga' level
    of the `centrifuga_cin` index).
    """
    sources = {
        'cultivo': var['ID Centrífuga'],
        'horas': centrifuga_horas.dropna(subset=['LOTE']).set_index('LOTE')['ID Centrífuga'],
        'cinetico': centrifuga_cin.reset_index('ID Centrífuga')['ID Centrífuga'],
    }
    side_by_side = pd.concat(list(sources.values()), axis=1, keys=list(sources))
    return side_by_side.loc[var.index]


# In[5]:


def agg_cineticos(cineticos, group_by=['LOTE']):
    """Aggregate the measurements of each group into one row.

    For every group defined by `group_by` the result holds, per original
    column ``c``: ``c_ini`` (first row), ``c_fin`` (last row) and, for numeric
    columns only, ``c`` (mean) and ``c_var`` (log of last/first), plus a
    ``count`` column with the number of rows in the group.

    Groups with a single row get NaN in the mean and in the ``_ini`` columns
    (``_fin`` is kept); ``c_var`` is NaN where the first value is 0.

    NOTE(review): the row count is read from the 'index' column produced by
    ``reset_index()``, so `cineticos` is expected to have an unnamed,
    single-level index. The default `group_by` list is never mutated here.
    """
    # First / last row of each group, NaN values included (skipna=False).
    first = cineticos.groupby(group_by).first(skipna=False)
    last = cineticos.groupby(group_by).last(skipna=False)
    mean = cineticos.groupby(group_by).mean(numeric_only=True)
    # std was removed: the test set usually has fewer measurements per group,
    # which could make the statistic non-comparable and bias the model
    # std = cineticos.groupby(group_by).std(numeric_only=True)
    # Log-change between last and first value; must be computed before
    # `first` is blanked for single-row groups below.
    var = np.log(last[mean.columns] / first[mean.columns])
    var[first[mean.columns] == 0] = float('nan')
    count = cineticos.reset_index().groupby(group_by).count()['index'].rename('count')
    # With one measurement, mean and first carry no information beyond `last`.
    mean[count == 1] = float('nan')
    first[count == 1] = float('nan')
    agg = first.join(last, lsuffix='_ini', rsuffix='_fin')
    agg = agg.join(mean).join(var, rsuffix='_var')
    agg = agg.join(count)
    return agg

def agg_temporal(data, lote, id_, fecha_ini, fecha_fin):
    """Summarise the signals of one piece of equipment over a time window.

    Takes ``data[id_]`` between `fecha_ini` and `fecha_fin` (label slice) and
    returns a one-row DataFrame indexed by `lote`. For every signal ``c`` it
    holds ``c`` (mean), ``c_std`` (standard deviation), ``c_ret`` (mean of the
    consecutive differences) and ``c_ret_std`` (their standard deviation).
    """
    window = data[id_].loc[fecha_ini:fecha_fin].astype(float)
    steps = window.diff()

    def _as_row(stat):
        # Series of per-signal values -> single row labelled with the lot.
        return stat.rename(lote).to_frame().T

    # Other statistics considered: median, min, max
    agg = _as_row(window.mean())
    for stat, suffix in (
        (window.std(), '_std'),
        (steps.mean(), '_ret'),
        (steps.std(), '_ret_std'),
    ):
        agg = agg.join(_as_row(stat), rsuffix=suffix)
    return agg


# In[6]:


# NaN imputation
def impute_nan_knn(features, f, t, n_neighbors=4):
    """Fill the NaNs of the columns `f` with a KNN imputer fitted without the rows `t`.

    The columns are standardised with the mean and standard deviation of the
    rows that are NOT in `t`, the imputer is fitted on those rows only, and
    the imputed values are mapped back to the original scale with the same
    statistics.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table. The columns in `f` are overwritten in place and the
        same object is returned.
    f : list
        Columns to impute (they also form the neighbour space).
    t : index labels
        Rows held out when computing the scaling statistics and fitting.
    n_neighbors : int, default 4
        Number of neighbours used by ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        `features`, with the NaNs of the columns `f` filled.
    """
    knn_imputer = KNNImputer(n_neighbors=n_neighbors)
    train = features.drop(t)
    center = train[f].mean()
    # The same scale must be used to standardise and to undo it. Dividing by
    # the std of all rows while multiplying back by the training std (as was
    # done before) returned mis-scaled values and let the held-out rows
    # influence the scaling.
    scale = train[f].std()
    z = (features[f] - center) / scale
    knn_imputer.fit(z.drop(t))
    imputed = pd.DataFrame(knn_imputer.transform(z), index=features.index, columns=f)
    imputed = imputed * scale + center
    for col in f:
        # Only the missing cells are replaced; observed values are kept.
        features[col] = features[col].fillna(imputed[col])

    return features

def impute_nan(feats, impute_kind, t):
    """Return a copy of `feats` with its NaNs imputed, fitting without the rows `t`.

    With ``impute_kind == 'KNN'`` the columns are grouped by the text before
    the first '_' in their name and each group is imputed separately.
    Otherwise `impute_kind` (lower-cased) is used as the ``SimpleImputer``
    strategy.

    NOTE(review): the KNN branch calls `impute_nan_knn_2`, which is not
    defined in the part of the file reviewed here (only `impute_nan_knn`
    is) — confirm it exists further down.
    """
    if impute_kind == 'KNN':
        imputed_features = feats.copy()
        prefixes = {name.split('_')[0] for name in feats.columns}
        for prefix in prefixes:
            group = [name for name in feats.columns if name.split('_')[0] == prefix]
            imputed_features = impute_nan_knn_2(imputed_features, group, t)
        return imputed_features

    imputer = SimpleImputer(strategy=impute_kind.lower())
    imputer.fit(feats.drop(t))
    filled = imputer.transform(feats)
    return pd.DataFrame(filled, index=feats.index, columns=feats.columns)


# In[7]:


def select_by_correlation(X, y, limit=None):
    """Rank the features by absolute correlation with `y`, optionally skipping redundant ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : pandas.Series
        Target, aligned with `X` on the index.
    limit : float, optional
        When given, features are taken greedily from the most to the least
        correlated with `y`, and every feature whose absolute correlation
        with an already selected one exceeds `limit` is discarded.

    Returns
    -------
    pandas.Series or list
        With ``limit=None``: the absolute correlations sorted in descending
        order (a Series indexed by feature). Otherwise: the list of selected
        feature names, in selection order.
    """
    selection_corr = X.corrwith(y).abs().sort_values(ascending=False)
    if limit is None:
        return selection_corr

    # Selection by correlation without redundancy: a single pass over the
    # ranking. The correlation matrix is indexed directly, so the result does
    # not depend on the name of the column index.
    corr_abs = X.corr().abs()
    selection = []
    excluded = set()
    for col in selection_corr.index:
        if col in excluded:
            continue
        selection.append(col)
        excluded.add(col)
        too_close = corr_abs.loc[col] > limit
        excluded.update(too_close[too_close].index)

    return selection

def select_by_pvalues(X, y):
    """Return the univariate `f_regression` p-value of every column of `X` against `y`, sorted ascending.

    Each column is tested on its own, using only the rows where both the
    column and `y` are present.
    """
    pvalues = []
    for col in X.columns:
        pair = pd.concat([X[col], y], axis=1).dropna()
        _, p = f_regression(pair[[col]], pair[y.name])
        pvalues.append(p[0])
    return pd.Series(pvalues, index=X.columns, dtype=float).sort_values()

def select_by_pvalues_multi(X, y, threshold):
    """Forward selection driven by OLS p-values.

    Features are tried in decreasing order of absolute correlation with `y`.
    A candidate is kept when, in an OLS fit (with intercept) on the features
    kept so far plus the candidate, its own p-value is below `threshold`.
    Rows with a NaN in any column involved are dropped before each fit.

    Returns
    -------
    pandas.Index
        The selected feature names.
    """
    ranking = X.corrwith(y).abs().sort_values(ascending=False).index
    selected = pd.Index([])
    for col in ranking:
        data = pd.concat([X[selected.union([col])], y], axis=1).dropna()
        design = add_constant(data.drop(y.name, axis=1))
        pvalues = OLS(data[y.name], design).fit().pvalues
        if pvalues.loc[col] < threshold:
            selected = selected.union(pvalues.loc[[col]].index)
    return selected

def select_by_pvalues_multi_redundancy_drop(X, y, threshold):
    """Forward selection by OLS p-values with backward pruning.

    Works like `select_by_pvalues_multi`, but each time a candidate is
    accepted the model is pruned: while any selected feature has a p-value
    at or above `threshold`, the one with the largest p-value is removed and
    the model is refitted.

    Returns
    -------
    pandas.Index
        The selected feature names.
    """
    def _fit_pvalues(columns):
        # OLS with intercept on the rows that are complete for `columns` and y.
        data = pd.concat([X[columns], y], axis=1).dropna()
        design = add_constant(data.drop(y.name, axis=1))
        return OLS(data[y.name], design).fit().pvalues

    ranking = X.corrwith(y).abs().sort_values(ascending=False).index
    selected = pd.Index([])
    for col in ranking:
        pvalues = _fit_pvalues(selected.union([col]))
        if pvalues.loc[col] < threshold:
            selected = selected.union(pvalues.drop('const').loc[[col]].index)

            # Prune the features that stopped being significant.
            while pvalues.drop('const').max() >= threshold:
                worst = pvalues.drop('const').idxmax()
                selected = selected.drop(worst)
                pvalues = _fit_pvalues(selected)

    return selected

def select_by_pvalues_multi_continue(X, y, threshold, selection_pvalues_multi, to_try):
    """Continue a forward selection from an existing set of features.

    Only the features listed in `to_try` are considered, in decreasing order
    of absolute correlation with `y`. A candidate is added to
    `selection_pvalues_multi` when its p-value in an OLS fit (with intercept)
    on the current selection plus the candidate is below `threshold`.

    Returns
    -------
    pandas.Index
        The extended selection.
    """
    ranking = X.corrwith(y).abs().sort_values(ascending=False).index
    candidates = [name for name in ranking if name in to_try]
    for col in candidates:
        data = pd.concat([X[selection_pvalues_multi.union([col])], y], axis=1).dropna()
        design = add_constant(data.drop(y.name, axis=1))
        pvalues = OLS(data[y.name], design).fit().pvalues
        if pvalues.loc[col] < threshold:
            selection_pvalues_multi = selection_pvalues_multi.union(pvalues.drop('const').loc[[col]].index)

    return selection_pvalues_multi


# ### READ

# ##### of

# In[8]:


# Production orders (OF): first sheet of the workbook, cleaned with preprocess().
of = pd.read_excel(FOLDER + FILES['of'][0], sheet_name=0)
of = preprocess(of)
# Lot ids are written with a '/' in this sheet; remove it
# (presumably so they match the LOTE keys of the other tables — confirm).
of['Lote'] = of['Lote'].str.replace('/', '')
of = of.rename({'Lote': 'LOTE'}, axis=1)
of = of.set_index('LOTE')


# ##### fases

# In[9]:


# Pre-inoculum sheet: it has a two-row header.
preinoculo = pd.read_excel(FOLDER + FILES['producción'][0], sheet_name='Preinóculo', header=[0, 1])

# Flatten the two header rows into "<top>_<bottom>"; when the top cell is
# empty ('Unnamed...') only the bottom label is kept.
preinoculo.columns = ['_'.join([x, y]) if 'Unnamed' not in x else y for x, y in preinoculo.columns]
date_cols = ['Fecha/hora inicio', 'Fecha/hora fin']
for col in date_cols:
    preinoculo[col] = pd.to_datetime(preinoculo[col])
preinoculo = preprocess(preinoculo)
preinoculo['LOTE'] = preinoculo['LOTE'].astype(str)
preinoculo = preinoculo.set_index('LOTE')

# pre-inoculum actually used: per-line pH / turbidity columns and the
# 'Línea utilizada' flag columns
ph = [x for x in preinoculo.columns if 'pH' in x]
turbidez = [x for x in preinoculo.columns if 'Turbidez' in x]
uso = [x for x in preinoculo.columns if 'Línea utilizada' in x]
# Inspection only (the result is discarded): how many lines each lot used.
preinoculo[uso].sum(axis=1).value_counts()

# Rename the measurement columns to the flag-column names so the boolean mask
# aligns column-wise (assumes ph/turbidez/uso are listed in the same line
# order — TODO confirm), then average over the lines flagged with 1.
rename = {x: y for x, y in zip(ph, uso)}
ph_uso = preinoculo[ph].rename(rename, axis=1)[preinoculo[uso] == 1].mean(axis=1)
rename = {x: y for x, y in zip(turbidez, uso)}
turbidez_uso = preinoculo[turbidez].rename(rename, axis=1)[preinoculo[uso] == 1].mean(axis=1)
uso_uso = preinoculo[uso].sum(axis=1)

preinoculo_uso = pd.concat([ph_uso, turbidez_uso, uso_uso], axis=1, keys=['pH_utilizada', 'Turbidez_utilizada', 'Líneas_utilizadas'])
preinoculo = preinoculo.join(preinoculo_uso)
preinoculo = preinoculo.drop(ph + turbidez + uso, axis=1) # drop the per-line data


# In[10]:


# Share of rows that duplicate an earlier row.
preinoculo.duplicated().sum() / preinoculo.shape[0]


# In[11]:


# Inoculum sheet, indexed by lot (as string).
inoculo = pd.read_excel(FOLDER + FILES['producción'][0], sheet_name='Inóculo', parse_dates=['Fecha/hora inicio', 'Fecha/hora fin'])
inoculo = preprocess(inoculo)
inoculo['LOTE'] = inoculo['LOTE'].astype(str)
inoculo = inoculo.set_index('LOTE')
# Equipment ids are normalised to integers.
inoculo['ID bioreactor'] = inoculo['ID bioreactor'].astype(float).round(0).astype(int)


# In[12]:


# Share of rows that duplicate an earlier row.
inoculo.duplicated().sum() / inoculo.shape[0]


# In[13]:


# Final-culture sheet (training lots), indexed by lot.
cultivo = pd.read_excel(FOLDER + FILES['producción'][0], sheet_name='Cultivo final', parse_dates=['Fecha/hora inicio', 'Fecha/hora fin'])
cultivo = preprocess(cultivo)
# Lot ids go through float -> int -> str so that e.g. 23019.0 becomes '23019'.
cultivo['LOTE'] = cultivo['LOTE'].astype(float).round(0).astype(int).astype(str)
# Same normalisation for the parent lot, only where one is given.
nona = ~cultivo['LOTE parental'].isna()
cultivo.loc[nona, 'LOTE parental'] = cultivo.loc[nona, 'LOTE parental'].astype(float).round(0).astype(int).astype(str)
cultivo = cultivo.set_index('LOTE')
cultivo['ID Bioreactor'] = cultivo['ID Bioreactor'].round(0).astype(int)
cultivo['ID Centrífuga'] = cultivo['ID Centrífuga'].round(0).astype(int)


# In[14]:


# Final-culture sheet of the test workbook, prepared like `cultivo`.
test = pd.read_excel(FOLDER + FILES['test'][0], sheet_name='Cultivo final', parse_dates=['Fecha/hora inicio', 'Fecha/hora fin'])#, dtype={'ID Centrífuga': float})
test = preprocess(test)
test['LOTE'] = test['LOTE'].astype(str)
# Parent lot ids are normalised to integer-like strings where present.
# NOTE(review): unlike the training sheet, there is no astype(float) before round() here.
test.loc[~test['LOTE parental'].isna(), 'LOTE parental'] = test.loc[~test['LOTE parental'].isna(), 'LOTE parental'].round(0).astype(int).astype(str)
test = test.set_index('LOTE')
test['ID Bioreactor'] = test['ID Bioreactor'].astype(float).round(0).astype(int)
test['ID Centrífuga'] = test['ID Centrífuga'].astype(float).round(0).astype(int)


# ##### ambientales

# In[15]:


# Ambient temperature / humidity readings, indexed by timestamp.
ambientales = pd.read_excel(FOLDER + FILES['ambientales'][0], sheet_name='Datos', parse_dates=['DateTime'])
# Sensor tags -> short names (T_* / H_* prefixes; presumably temperature and
# humidity of each room — confirm against the data dictionary).
nuevos_nombres ={'DateTime':'Fecha', '06299_TI1302.PV':'T_bios', '06299_MI1302.PV':'H_bios', '06299_TI1402.PV':'T_centri', '06299_MI1402.PV':'H_centri',
                '07633_TI0601.PV':'T_alma_princ', '07633_HI0101.PV':'H_alma_princ', '07781_TI1501.PV':'T_alma_prod', '07781_MI1501.PV':'H_alma_prod'}
ambientales = ambientales.rename(columns=nuevos_nombres)
ambientales = ambientales.set_index('Fecha')


# ##### biorreactores

# In[16]:


# One time-indexed DataFrame of sensor readings per bioreactor id.
biorreactores = {}
for file, bio_id in zip(FILES['biorreactor'], biorreactores_id):
    aux = pd.read_excel(FOLDER + file, sheet_name='Datos', header=[0], parse_dates=['DateTime'])
    # Constant columns are kept so every bioreactor ends up with the same columns.
    aux = preprocess(aux, drop_unique=False)
    # Strip the equipment-specific prefixes so column names are comparable.
    aux.columns = [x.replace(str(bio_id) + '_', '').replace('FERM0101.', '') for x in aux.columns]
    aux = aux.set_index('DateTime').sort_index()
    # Two probes each for DO and pH: keep the larger reading, with negatives clipped to 0.
    aux['DO_PV'] = aux[['DO_1_PV', 'DO_2_PV']].clip(lower=0).max(axis=1)
    aux['pH_PV'] = aux[['pH_1_PV', 'pH_2_PV']].clip(lower=0).max(axis=1)
    aux = aux.drop(['DO_1_PV', 'DO_2_PV', 'pH_1_PV', 'pH_2_PV'], axis=1)
    biorreactores[bio_id] = aux


# In[17]:


# check that all bioreactors have the same set of columns
coin = pd.DataFrame(True, index=biorreactores_id, columns=biorreactores_id)
for i in biorreactores_id:
    for j in biorreactores_id:
        # Each unordered pair is compared once and mirrored in the table.
        if i > j:
            c = set(biorreactores[i].columns.to_list()) == set(biorreactores[j].columns.to_list())
            coin.loc[i, j] = c
            coin.loc[j, i] = c
# True when every pair matches.
coin.all().all()


# In[18]:


# Lots per bioreactor id in each table, and whether a sensor file exists for that id.
bio = [inoculo['ID bioreactor'].value_counts(), cultivo['ID Bioreactor'].value_counts(), test['ID Bioreactor'].value_counts()]
bio = pd.concat(bio, axis=1, keys=['inoculo', 'cultivo', 'test'])
bio['isin_biorreactores'] = bio.index.isin(biorreactores_id)
bio


# In[19]:


# id 13189 has no sensor file in FILES['biorreactor']; we assume it is a typo for 13169
wrong_biorreactor = cultivo[cultivo['ID Bioreactor'] == 13189].index
cultivo.loc[wrong_biorreactor, 'ID Bioreactor'] = 13169


# In[20]:


# Same summary table, recomputed after the correction.
bio = [inoculo['ID bioreactor'].value_counts(), cultivo['ID Bioreactor'].value_counts(), test['ID Bioreactor'].value_counts()]
bio = pd.concat(bio, axis=1, keys=['inoculo', 'cultivo', 'test'])
bio['isin_biorreactores'] = bio.index.isin(biorreactores_id)
bio


# ##### centrifugas

# In[21]:


# Lots per centrifuge id in train and test, and whether a sensor file exists for that id.
centri = [cultivo['ID Centrífuga'].value_counts(), test['ID Centrífuga'].value_counts()]
centri = pd.concat(centri, axis=1, keys=['cultivo', 'test']).fillna(0)
centri['isin_centrífugas'] = centri.index.isin(centrifugas_id)
centri


# In[22]:


# Start/end log of the centrifugations.
centrifuga_horas = pd.read_excel(FOLDER + FILES['centrífuga_horas'][0], sheet_name='Hoja1', parse_dates=['DATEVALUE'])
centrifuga_horas = preprocess(centrifuga_horas)
centrifuga_horas['EQUIPO'] = centrifuga_horas['EQUIPO'].astype(float).round(0).astype(int)
centrifuga_horas = centrifuga_horas.rename({'EQUIPO': 'ID Centrífuga'}, axis=1)

# 'Operación' reads like "Centrifugació <N> ini" / "Centrifugació <N> fi":
# keep only the centrifugation number N.
centrifuga_horas['N'] = centrifuga_horas['Operación'].str.replace('Centrifugació ', '').str.replace(' ini', '').str.replace(' fi', '').astype(int)
centrifuga_horas = centrifuga_horas.sort_values('DATEVALUE').drop(['Operación'], axis=1)

# Per (order, centrifuge): first/last record of centrifugation 1 and of
# centrifugation 2, placed side by side with suffixes _1 / _2.
centrifuga_horas_1 = agg_cineticos(centrifuga_horas[centrifuga_horas['N'] == 1].drop('N', axis=1), ['Orden', 'ID Centrífuga'])
centrifuga_horas_2 = agg_cineticos(centrifuga_horas[centrifuga_horas['N'] == 2].drop('N', axis=1), ['Orden', 'ID Centrífuga'])
centrifuga_horas = centrifuga_horas_1.join(centrifuga_horas_2, lsuffix='_1', rsuffix='_2')
centrifuga_horas = centrifuga_horas.drop(['count_1', 'count_2'], axis=1).reset_index()

# Attach the lot of each order (left join: orders missing from `of` keep LOTE = NaN).
centrifuga_horas = centrifuga_horas.merge(of.reset_index()[['LOTE', 'Orden']], how='left', on='Orden')


# In[23]:


# One time-indexed DataFrame of sensor readings per centrifuge id.
centrifugas = {}
for file, centri_id in zip(FILES['centrífuga'], centrifugas_id):
    aux = pd.read_excel(FOLDER + file, sheet_name='Datos', header=[0], parse_dates=['DateTime'])
    aux = preprocess(aux, drop_unique=False)
    # Strip the equipment id prefix so column names are comparable.
    aux.columns = [x.replace(str(centri_id) + '_', '') for x in aux.columns]
    aux = aux.set_index('DateTime').sort_index()
    centrifugas[centri_id] = aux


# In[24]:


# check that all centrifuges have the same set of columns
coin = pd.DataFrame(True, index=centrifugas_id, columns=centrifugas_id)
for i in centrifugas_id:
    for j in centrifugas_id:
        if i > j:
            c = set(centrifugas[i].columns.to_list()) == set(centrifugas[j].columns.to_list())
            coin.loc[i, j] = c
            coin.loc[j, i] = c
# True when every pair matches.
coin.all().all()


# ##### cinéticos

# In[25]:


# Kinetic (in-process control) measurements of the inoculum phase.
inoculo_cin = pd.read_excel(FOLDER + FILES['cinéticos'][0], sheet_name='Inóculos', parse_dates=['Fecha'])
inoculo_cin = preprocess(inoculo_cin)
inoculo_cin = inoculo_cin.rename({'Lote': 'LOTE'}, axis=1)
inoculo_cin['LOTE'] = inoculo_cin['LOTE'].astype(str)
float_cols = ['Turbidez', 'Viabilidad']
inoculo_cin[float_cols] = inoculo_cin[float_cols].astype(float)


# In[26]:


# Kinetic measurements of the final-culture phase (these also include glucose).
cultivo_cin = pd.read_excel(FOLDER + FILES['cinéticos'][0], sheet_name='Cultivos finales', parse_dates=['Fecha'])
cultivo_cin = preprocess(cultivo_cin)
cultivo_cin = cultivo_cin.rename({'Lote': 'LOTE'}, axis=1)
cultivo_cin['LOTE'] = cultivo_cin['LOTE'].astype(str)
float_cols = ['Turbidez', 'Viabilidad', 'Glucosa g/L']
cultivo_cin[float_cols] = cultivo_cin[float_cols].astype(float)


# In[27]:


# Kinetic measurements of the centrifugation phase.
centrifuga_cin = pd.read_excel(FOLDER + FILES['cinéticos'][0], sheet_name='Centrifugación')
centrifuga_cin = preprocess(centrifuga_cin)
centrifuga_cin = centrifuga_cin.rename({'Lote': 'LOTE', 'Centrífuga': 'ID Centrífuga'}, axis=1)
centrifuga_cin['LOTE'] = centrifuga_cin['LOTE'].astype(str)
# 'N' is the centrifugation number (1 or 2).
centrifuga_cin = centrifuga_cin.rename({'Centrifugada (1 o 2)': 'N'}, axis=1)
centrifuga_cin['Turbidez'] = centrifuga_cin['Turbidez'].astype(float)
centrifuga_cin = centrifuga_cin.drop('Volumen centrifugado (L)', axis=1) # the volume is not used


# In[28]:


# Collapse the kinetic measurements to one row per lot.
inoculo_cin = agg_cineticos(inoculo_cin)
cultivo_cin = agg_cineticos(cultivo_cin)


# In[29]:


# Centrifugation kinetics: one row per (lot, centrifuge), with centrifugation 1
# and 2 aggregated separately and joined with suffixes _1 / _2.
centrifuga_cin_1 = agg_cineticos(centrifuga_cin[centrifuga_cin['N'] == 1].drop('N', axis=1), ['LOTE', 'ID Centrífuga'])
centrifuga_cin_2 = agg_cineticos(centrifuga_cin[centrifuga_cin['N'] == 2].drop('N', axis=1), ['LOTE', 'ID Centrífuga'])
centrifuga_cin = centrifuga_cin_1.join(centrifuga_cin_2, lsuffix='_1', rsuffix='_2')


# In[30]:


# small error in lot 23040: a lot that appears under more than one centrifuge
duplicated = centrifuga_cin.reset_index(['LOTE'])['LOTE']
duplicated = duplicated[duplicated.duplicated()]
centrifuga_cin.loc[duplicated]


# In[31]:


# What the start/end log says about those lots.
centrifuga_horas[centrifuga_horas['LOTE'].isin(duplicated)]


# In[32]:


# For the duplicated lots, drop the (lot, centrifuge) rows whose centrifuge
# does not appear in the start/end log for those lots.
to_remove = [i for i in centrifuga_cin.index if i[0] in duplicated.values and i[1] not in centrifuga_horas[centrifuga_horas['LOTE'].isin(duplicated)]['ID Centrífuga'].values]
centrifuga_cin = centrifuga_cin.drop(to_remove)
to_remove


# ##### materiales

# In[33]:


# Component movements; unlike the other sheets this one is not passed through preprocess().
material = pd.read_excel(FOLDER + FILES['componentes'][0], sheet_name='Full1', parse_dates=['Fecha recepción', 'Fecha traslado'])

material = material.rename({'Lote': 'LOTE'}, axis=1)
material['LOTE'] = material['LOTE'].astype(str)


# ### PRODUCTO 1

# Algunos datos de train no tienen la variable input. Comprobamos que no son parentales y los eliminamos.

# In[34]:


# Training lots without a 'Producto 1' value.
missing_producto = cultivo[cultivo['Producto 1'].isna()].index
cultivo.loc[missing_producto]


# In[35]:


# Inspection: lots whose parent is one of them (expected to be none, per the note above).
cultivo[cultivo['LOTE parental'].isin(missing_producto)]


# In[36]:


cultivo = cultivo.drop(missing_producto)


# ### LOTES INDEPENDIENTES Y DEPENDIENTES

# In[37]:


# Train and test lots in a single table; TEST flags the rows that come from the test file.
var = pd.concat([cultivo, test])
var['LOTE'] = var.index
var['TEST'] = var['LOTE'].isin(test.index)


# In[38]:


# Dependency flags and parent / previous-lot comparison (this also adds 'LOTE previo' to var).
check = check_dependencias(var)
parental, previo = check_parental_previo(var)


# #### MISSING OF

# Ahora hay muchos lotes que no están en la of (todos de train). En principio los eliminaríamos, pero varios son hijos cuyo parental sí está en la of, lo cual es extraño.
# Ninguno de los nuevos lotes añadidos a cultivo está en la of, lo que parece un error: ['23052', '23066', '23074', '24002', '24026', '24029', '24034', '24039', '24048'].

# In[39]:


# Lots that are missing from the production orders, with their TEST flag.
check[~check['in_of']].join(var['TEST'])


# In[40]:


# Flags of the parent of lot 24029.
check.loc[cultivo.loc['24029', 'LOTE parental']]


# In[41]:


# drop the lots that are not in the production orders
missing_of = check[~check['in_of']].index
var = var.drop(missing_of)

# Recompute the checks on the reduced table.
check = check_dependencias(var)
parental, previo = check_parental_previo(var)


# ##### búsqueda de of

# In[42]:


# missing_of = check[~check['in_of']].index
# 
# # el id centrífuga coincide en cultivo y cinéticos
# a = var.loc[missing_of, 'ID Centrífuga']
# b = centrifuga_cin.loc[missing_of].reset_index('ID Centrífuga')['ID Centrífuga']
# ((a - b) == 0).all()
# 
# missing_of = var.loc[missing_of, ['ID Centrífuga', 'Fecha/hora fin']]
# 
# for k in missing_of.index:
#     aux = centrifuga_horas[(centrifuga_horas['ID Centrífuga'] == missing_of.loc[k, 'ID Centrífuga']) & (centrifuga_horas['DATEVALUE_ini_1'].dt.date == missing_of.loc[k, 'Fecha/hora fin'].date())].sort_values('DATEVALUE_ini_1')
#     if aux.shape[0] == 1 and aux['LOTE'].isna().all():
#         print(k, 'ok')
#         # añadimos el lote en centrífuga_horas
#         centrifuga_horas.loc[aux.index[0], 'LOTE'] = k
#         # añadimos la of aunque no tenemos la cantidad entregada
#         to_add = [k, aux.loc[aux.index[0], 'Orden'], float('nan')]
#         to_add = pd.DataFrame([to_add], columns=of.reset_index().columns).set_index('LOTE')
#         of = pd.concat([of, to_add])
#     else:
#         print(k, 'no')
# 
# # comprobación de los cambios
# check = check_dependencias(var)
# parental, previo = check_parental_previo(var)
# check[['in_of', '~is_child', 'in_preinoculo', 'in_inoculo']].value_counts()


# #### INDEPENDIENTES

# - No tienen parental
# - Tienen preinóculo e inóculo
# - El orden es 1
# - El lag con el previo es largo
# - Se vacía el biorreactor

# In[43]:


# Independent lots: no parent, and present in both the pre-inoculum and the inoculum tables.
independent = check[check[['~is_child', 'in_preinoculo', 'in_inoculo']].all(axis=1)].index


# In[44]:


# Chain position of the independent lots.
var.loc[independent, 'Orden en el encadenado'].value_counts()


# In[45]:


# Lag and weight statistics with respect to the previous lot in the same bioreactor.
previo.loc[independent, ['Orden en el encadenado', 'lag', 'peso_minimo', 'peso_reduccion']].describe()


# #### DEPENDIENTES

# - Tiene parental
# - No tiene preinóculo ni inóculo
# - El orden es el del parental + 1
# - El lag con el parental es corto
# - No se vacía el biorreactor
# - El lote previo coincide con el parental
# - El biorreactor coincide con el del parental

# In[46]:


# Dependent lots: they have a parent and appear in neither the pre-inoculum nor the inoculum table.
dependent = check[(check[['~is_child', 'in_preinoculo', 'in_inoculo']] == False).all(axis=1)].index


# In[47]:


# Does the previous lot in the bioreactor coincide with the declared parent for all of them?
(previo.loc[dependent, 'LOTE previo'] == parental.loc[dependent, 'LOTE parental']).all()


# In[48]:


# Dependent lots for which it does not.
wrong_previo = previo.loc[dependent][previo.loc[dependent, 'LOTE previo'] != parental.loc[dependent, 'LOTE parental']].index
previo.loc[wrong_previo]


# In[49]:


parental.loc[wrong_previo]


# In[50]:


# Bioreactor ids that have at least one training lot (used for the plots below).
idx = bio['cultivo'].dropna().index


# In[51]:


# Net load-cell weight of each bioreactor over the two date windows under
# inspection; one 2x3 figure per window, one panel per bioreactor.
for _window_start, _window_end in (('2024-01-15', '2024-01-30'), ('2024-02-01', '2024-02-15')):
    fig, axs = plt.subplots(2, 3, figsize=(15, 6), tight_layout=True)
    axs = axs.ravel()
    for i, ax in zip(idx, axs):
        weight = biorreactores[i].loc[_window_start:_window_end]['Load_Cell_Net_PV']
        weight.plot(grid=True, ax=ax, title=int(i))


# In[53]:


# lots 24011 and 24015 do not fit any bioreactor or parent, so they are removed
to_remove = ['24011', '24015']
var = var.drop(to_remove)


# In[54]:


# lot 24017 fits in its parent's bioreactor, so it is reassigned to it
var.loc['24017', 'ID Bioreactor'] = var.loc[cultivo.loc['24017', 'LOTE parental'], 'ID Bioreactor']


# In[55]:


check = check_dependencias(var)
parental, previo = check_parental_previo(var)
dependent = check[(check[['~is_child', 'in_preinoculo', 'in_inoculo']] == False).all(axis=1)].index


# In[56]:


(previo.loc[dependent, 'LOTE previo'] == parental.loc[dependent, 'LOTE parental']).all()


# In[57]:


(previo.loc[dependent, 'Orden en el encadenado'] == (previo.loc[dependent, 'Orden en el encadenado_previo'] + 1)).all()


# In[58]:


previo.loc[dependent, ['Orden en el encadenado', 'lag', 'peso_minimo', 'peso_reduccion']].describe()


# #### PREINÓCULO E INÓCULO DEPENDIENTES

# In[59]:


# Recompute the dependency tables and count the combinations of the four flags.
check = check_dependencias(var)
parental, previo = check_parental_previo(var)
check[['in_of', '~is_child', 'in_preinoculo', 'in_inoculo']].value_counts()


# In[60]:


# independent: all three flags True; dependent: all three flags False.
independent = check[check[['~is_child', 'in_preinoculo', 'in_inoculo']].all(axis=1)].index
dependent = check[(check[['~is_child', 'in_preinoculo', 'in_inoculo']] == False).all(axis=1)].index


# In[61]:


# Size of each group.
len(independent), len(dependent)


# In[62]:


# Chaining with the parentals: dependent lots receive a copy of their parental's
# preinoculum / inoculum rows, appended to the original tables under the child's lot id.
def _append_inherited_rows(table, child_by_parent):
    """Return `table` plus the rows of each parental lot re-labelled with its child lot id."""
    inherited = table.loc[child_by_parent.index].rename(child_by_parent)
    return pd.concat([table, inherited])


# Order 2 first: their parentals already have rows in the original tables.
orden_2 = var.loc[var.index.isin(dependent) & (var['Orden en el encadenado'] == 2)].index
rename = var.loc[orden_2, 'LOTE parental'].reset_index().set_index('LOTE parental')['LOTE']
preinoculo = _append_inherited_rows(preinoculo, rename)
inoculo = _append_inherited_rows(inoculo, rename)
inoculo_cin = _append_inherited_rows(inoculo_cin, rename)

# Order 3 afterwards: their parentals (order 2) were appended in the previous step.
orden_3 = var.loc[var.index.isin(dependent) & (var['Orden en el encadenado'] == 3)].index
rename = var.loc[orden_3, 'LOTE parental'].reset_index().set_index('LOTE parental')['LOTE']
preinoculo = _append_inherited_rows(preinoculo, rename)
inoculo = _append_inherited_rows(inoculo, rename)
inoculo_cin = _append_inherited_rows(inoculo_cin, rename)


# ### BIORREACTORES

# In[63]:


# Aggregate bioreactor data over each lot's inoculum window (start/end taken from `inoculo`),
# only for lots whose inoculum bioreactor id is in `biorreactores_id`.
bio_inoculo = [agg_temporal(biorreactores, lote, inoculo.loc[lote, 'ID bioreactor'], 
                            inoculo.loc[lote, 'Fecha/hora inicio'], inoculo.loc[lote, 'Fecha/hora fin']) 
               for lote in var.index if inoculo.loc[lote, 'ID bioreactor'] in biorreactores_id]
bio_inoculo = pd.concat(bio_inoculo)


# In[64]:


# Same aggregation over each lot's culture window (start/end and bioreactor id taken from `var`).
bio_cultivo = [agg_temporal(biorreactores, lote, var.loc[lote, 'ID Bioreactor'], 
                            var.loc[lote, 'Fecha/hora inicio'], var.loc[lote, 'Fecha/hora fin']) 
               for lote in var.index if var.loc[lote, 'ID Bioreactor'] in biorreactores_id]
bio_cultivo = pd.concat(bio_cultivo)


# ### CENTRÍFUGAS

# #### centrífuga id

# In[65]:


# Centrifuge id per lot as reported by each source (columns used below: 'horas', 'cultivo', 'cinetico').
id_c = check_centrifuga_id(centrifuga_horas, var, centrifuga_cin)


# In[66]:


# 6379: this centrifuge is not among the data (these are train lots). I checked whether any of the known
# centrifuges had a gap in the hours that would correspond to these lots, but none did.
# the others are not in the OF table and therefore cannot be joined with centrifuges, unless we search for the gaps.
missing_centrifuga = id_c[id_c.isna().sum(axis=1) > 0].index
id_c.loc[missing_centrifuga]


# In[67]:


# centrifuge id that does not match across the different sources
# (NaN differences are filled with 0, so a missing id alone does not count as a mismatch)
err = id_c[id_c.diff(axis=1).fillna(0).abs().sum(axis=1) > 0].index
len(err)


# In[68]:


# Show the mismatching ids together with whether each one is a known centrifuge id.
id_c.loc[err].join(id_c.loc[err].isin(centrifugas_id), rsuffix='_isin')


# In[69]:


# fix the values that look like typos (14246)
replace = {14247: 14246, 14146: 14246}
centrifuga_cin = centrifuga_cin.reset_index()
centrifuga_cin['ID Centrífuga'] = centrifuga_cin['ID Centrífuga'].replace(replace)
centrifuga_cin = centrifuga_cin.set_index(['LOTE', 'ID Centrífuga'])


# In[70]:


# Recompute the mismatches after the typo fix.
id_c = check_centrifuga_id(centrifuga_horas, var, centrifuga_cin)
err = id_c[id_c.diff(axis=1).fillna(0).abs().sum(axis=1) > 0].index
id_c.loc[err]


# In[71]:


# for the rest, copy the id from the centrifuge-hours source, since at least the hours seem to fit well
# update var:
change_var = id_c[(id_c.index.isin(err)) & (id_c['horas'] != id_c['cultivo'])].index
var.loc[change_var, 'ID Centrífuga'] = id_c.loc[change_var, 'horas']
# update centrifuga_cin (the id is moved out of the index so it can be assigned, then restored):
change_cin = id_c[(id_c.index.isin(err)) & (id_c['horas'] != id_c['cinetico'])].index
centrifuga_cin = centrifuga_cin.reset_index('ID Centrífuga')
centrifuga_cin.loc[change_cin, 'ID Centrífuga'] = id_c.loc[change_cin, 'horas']
centrifuga_cin = centrifuga_cin.reset_index().set_index(['LOTE', 'ID Centrífuga'])


# In[72]:


# Final check: remaining mismatches after both corrections.
id_c = check_centrifuga_id(centrifuga_horas, var, centrifuga_cin)
id_c[id_c.diff(axis=1).fillna(0).abs().sum(axis=1) > 0]


# #### centrifugación 2

# Hay dos datos que no tienen hora para la segunda centrifugación y, sin embargo, sí tienen Centrifugación 2 en cultivo y cinéticos. Suponemos que faltan las horas y las estimamos:

# In[73]:


# Missing values per column of the centrifuge-hours table, restricted to lots present in `var`.
centrifuga_horas[centrifuga_horas['LOTE'].isin(var.index)].isna().sum()


# In[74]:


# Rows without a start time for the second centrifugation.
centrifuga_horas[centrifuga_horas['DATEVALUE_ini_2'].isna()]


# In[75]:


# missing_centrifugacion2_: row labels in `centrifuga_horas` (rows with a non-null LOTE only);
# missing_centrifugacion2: the corresponding lot ids.
missing_centrifugacion2_ = centrifuga_horas[centrifuga_horas['DATEVALUE_ini_2'].isna()].dropna(subset=['LOTE']).index
missing_centrifugacion2 = centrifuga_horas.loc[missing_centrifugacion2_].set_index('LOTE').index
var.loc[missing_centrifugacion2, 'Centrifugación 2 turbidez']


# In[76]:


# The kinetics table value for the same lots.
centrifuga_cin.loc[missing_centrifugacion2, 'Turbidez_fin_2']


# In[77]:


# Time between the end of centrifugation 1 and the start of centrifugation 2,
# and the duration of centrifugation 2, described for the lots present in `var`.
lag_1_to_2 = centrifuga_horas['DATEVALUE_ini_2'] - centrifuga_horas['DATEVALUE_fin_1']
duration_2 = centrifuga_horas['DATEVALUE_fin_2'] - centrifuga_horas['DATEVALUE_ini_2']
d = pd.concat([lag_1_to_2.rename('lag1->2'), duration_2.rename('duration2')], axis=1)
d[centrifuga_horas['LOTE'].isin(var.index)].describe()


# In[78]:


# Estimate the missing second-centrifugation times: start = end of centrifugation 1 + 1 h,
# end = estimated start + 1.5 h.
centrifuga_horas.loc[missing_centrifugacion2_, 'DATEVALUE_ini_2'] = centrifuga_horas.loc[missing_centrifugacion2_, 'DATEVALUE_fin_1'] + pd.Timedelta(1, 'h')
centrifuga_horas.loc[missing_centrifugacion2_, 'DATEVALUE_fin_2'] = centrifuga_horas.loc[missing_centrifugacion2_, 'DATEVALUE_ini_2'] + pd.Timedelta(1.5, 'h')
centrifuga_horas.loc[missing_centrifugacion2_]


# #### datos de centrifugas

# In[79]:


# Centrifuge hours indexed by lot (rows without a lot id are dropped).
centrifuga_horas_ = centrifuga_horas.dropna(subset='LOTE').set_index('LOTE')


# In[80]:


# Aggregate centrifuge data over the first centrifugation window of each lot,
# only for lots with a known centrifuge id and an entry in the hours table.
centrifuga_c1 = [agg_temporal(centrifugas, lote, var.loc[lote, 'ID Centrífuga'], 
                 centrifuga_horas_.loc[lote, 'DATEVALUE_ini_1'], centrifuga_horas_.loc[lote, 'DATEVALUE_fin_1']) 
                 for lote in var.index if var.loc[lote, 'ID Centrífuga'] in centrifugas_id and lote in centrifuga_horas_.index]
centrifuga_c1 = pd.concat(centrifuga_c1)

# Same aggregation over the second centrifugation window.
centrifuga_c2 = [agg_temporal(centrifugas, lote, var.loc[lote, 'ID Centrífuga'], 
                 centrifuga_horas_.loc[lote, 'DATEVALUE_ini_2'], centrifuga_horas_.loc[lote, 'DATEVALUE_fin_2']) 
                 for lote in var.index if var.loc[lote, 'ID Centrífuga'] in centrifugas_id and lote in centrifuga_horas_.index]
centrifuga_c2 = pd.concat(centrifuga_c2)


# In[81]:


# Join both windows side by side (suffixes _1 / _2) and left-join them onto the hours table.
centrifuga = centrifuga_c1.join(centrifuga_c2, lsuffix='_1', rsuffix='_2')
centrifuga = centrifuga_horas_.drop(['Orden', 'ID Centrífuga'], axis=1).join(centrifuga, how='left') # add the time stamps 


# In[82]:


centrifuga.shape


# In[83]:


var.index.difference(centrifuga.index) # these are the lots with an unknown centrifuge id


# ### VARIABLES CATEGÓRICAS

# In[84]:


# Indicator columns for the inoculum bioreactor id: count of 'Viabilidad final cultivo' per (lot, id),
# missing combinations filled with 0; columns renamed to 'ID bioreactor_<id>'.
cat_bio_inoculo = pd.pivot_table(inoculo.reset_index(), index='index', columns='ID bioreactor', values='Viabilidad final cultivo', aggfunc='count')
cat_bio_inoculo = cat_bio_inoculo.fillna(0)
cat_bio_inoculo = cat_bio_inoculo.rename({x: 'ID bioreactor_' + str(int(x)) for x in cat_bio_inoculo.columns}, axis=1)
# Sanity check: how many indicator columns are set per lot.
cat_bio_inoculo.sum(axis=1).value_counts()


# In[85]:


# Indicator columns for the culture bioreactor id, built the same way from `var`
# (counting 'Fecha/hora inicio'); columns renamed to 'ID Bioreactor_<id>'.
cat_bio_cultivo = pd.pivot_table(var.drop('LOTE', axis=1).reset_index(), index='LOTE', columns='ID Bioreactor', values='Fecha/hora inicio', aggfunc='count')
cat_bio_cultivo = cat_bio_cultivo.fillna(0)
cat_bio_cultivo = cat_bio_cultivo.rename({x: 'ID Bioreactor_' + str(int(x)) for x in cat_bio_cultivo.columns}, axis=1)
cat_bio_cultivo.sum(axis=1).value_counts()


# In[86]:


# Indicator columns for the centrifuge id; ids not in `centrifugas_id` are dropped,
# so lots with an unknown centrifuge end up with all indicators at 0.
cat_centrifuga = pd.pivot_table(var.drop('LOTE', axis=1).reset_index(), index='LOTE', columns='ID Centrífuga', values='Fecha/hora inicio', aggfunc='count')
cat_centrifuga = cat_centrifuga.fillna(0).drop([x for x in cat_centrifuga if x not in centrifugas_id], axis=1)
cat_centrifuga = cat_centrifuga.rename({x: 'ID Centrífuga_' + str(int(x)) for x in cat_centrifuga.columns}, axis=1)
cat_centrifuga.sum(axis=1).value_counts()


# In[87]:


# replace the id columns with the indicator (categorical) variables
inoculo = inoculo.drop('ID bioreactor', axis=1).join(cat_bio_inoculo)
var = var.drop('ID Bioreactor', axis=1).join(cat_bio_cultivo)
var = var.drop('ID Centrífuga', axis=1).join(cat_centrifuga)


# ### FECHAS

# In[88]:


# Centrifugation start/end time stamps indexed by lot.
ch = centrifuga_horas.dropna(subset='LOTE').set_index(['LOTE'])[['DATEVALUE_ini_1', 'DATEVALUE_fin_1', 'DATEVALUE_ini_2', 'DATEVALUE_fin_2']]

# `keys` names each pair of time stamps in `dates`, in the same order:
# phase durations, differences between redundant sources, and lags between consecutive phases.
keys = ['preinoculo', 'inoculo', 'inoculo_cin', 'cultivo', 'cultivo_cin', 'centrifugación',  'centrifugación1',  'centrifugación2', 
        'inoculo_ini', 'inoculo_fin', 'cultivo_ini', 'cultivo_fin',
        'pre->ino', 'pre->ino_cin', 'ino->cultivo', 'ino->cultivo_cin', 'ino_cin->cultivo', 'ino_cin->cultivo_cin', 'cultivo->centri', 'cultivo_cin->centri',
        'centri1->centri2'
       ]
dates = [
    [preinoculo['Fecha/hora inicio'], preinoculo['Fecha/hora fin']], [inoculo['Fecha/hora inicio'], inoculo['Fecha/hora fin']],
    [inoculo_cin['Fecha_ini'], inoculo_cin['Fecha_fin']], [var['Fecha/hora inicio'], var['Fecha/hora fin']],
    [cultivo_cin['Fecha_ini'], cultivo_cin['Fecha_fin']], [ch['DATEVALUE_ini_1'], ch['DATEVALUE_fin_2']], 
    [ch['DATEVALUE_ini_1'], ch['DATEVALUE_fin_1']], [ch['DATEVALUE_ini_2'], ch['DATEVALUE_fin_2']], 
    [inoculo['Fecha/hora inicio'], inoculo_cin['Fecha_ini']], [inoculo['Fecha/hora fin'], inoculo_cin['Fecha_fin']], 
    [var['Fecha/hora inicio'], cultivo_cin['Fecha_ini']], [var['Fecha/hora fin'], cultivo_cin['Fecha_fin']],
    
    # preinoculum -> inoculum -> culture -> centrifugation      
    [preinoculo['Fecha/hora fin'], inoculo['Fecha/hora inicio']], [preinoculo['Fecha/hora fin'], inoculo_cin['Fecha_ini']],
    [inoculo['Fecha/hora fin'], var['Fecha/hora inicio']], [inoculo['Fecha/hora fin'], cultivo_cin['Fecha_ini']], 
    [inoculo_cin['Fecha_fin'], var['Fecha/hora inicio']], [inoculo_cin['Fecha_fin'], cultivo_cin['Fecha_ini']], 
    [var['Fecha/hora fin'], ch['DATEVALUE_ini_1']], [cultivo_cin['Fecha_fin'], ch['DATEVALUE_ini_1']],  
    [ch['DATEVALUE_fin_1'], ch['DATEVALUE_ini_2']]
]
# Each pair becomes a two-column frame; the difference is max - min, i.e. always non-negative
# regardless of which time stamp comes first.
dates = [pd.concat(x, axis=1) for x in dates]
dates_diff = [x.max(axis=1) - x.min(axis=1) for x in dates]
dates_diff = pd.concat(dates_diff, axis=1, keys=keys).loc[var.index]
dates_diff.describe()


# In[89]:


# Convert every duration/lag to whole minutes and discard the redundant columns.
lags_in_minutes = dates_diff.apply(lambda col: col.dt.total_seconds()).div(60).round() # in minutes
to_drop = ['inoculo_ini', 'inoculo_fin', 'cultivo_ini', 'cultivo_fin', 'cultivo_cin->centri', 'ino->cultivo_cin',
           'ino_cin->cultivo_cin', 'cultivo_cin', 'inoculo_cin', 'pre->ino_cin', 'ino_cin->cultivo'] # the redundant ones
proceso_lags = lags_in_minutes.drop(columns=to_drop)


# #### day of the year

# Esta variable podría tener sentido antes de introducir el dataset de ambientales. Como ya se ha incluido este dataset, es prescindible y la descartamos por simplicidad.

# In[90]:


# (np.cos(2*np.pi / 365 * pd.Series(range(1, 366)))).plot()
# days = np.cos(2 * np.pi / 365 * var['Fecha/hora inicio'].dt.dayofyear).rename('day_of_the_year')
# proceso_lags = proceso_lags.join(days)


# ### MATERIALES

# In[91]:


# there are negative quantities that look like corrections, so we add them up per identical record
# NOTE(review): groupby drops rows whose key columns contain NaN by default (dropna=True) —
# confirm none of the six key columns can be missing, otherwise those rows are silently lost.
material = material.groupby(['LOTE', 'Material', 'Lote interno', 'Lote Proveedor', 'Fecha recepción', 'Fecha traslado']).sum().reset_index()
material = material[material['Qty'] > 0]

# share of each internal lot in the total quantity of the material 
total = material.groupby(['LOTE', 'Material']).sum(numeric_only=True)['Qty'].rename('total').reset_index()
material = material.merge(total, how='left', on=['LOTE', 'Material'])
material['proporcion'] = material['Qty'] / material['total']

# time in the main warehouse (whole days between reception and transfer)
material['tiempo_almacén'] = (material['Fecha traslado'] - material['Fecha recepción']).dt.days


# In[92]:


# attach the culture start/end of each lot and keep only lots present in `var`
material = material.merge(var[['Fecha/hora inicio', 'Fecha/hora fin']], how='left', left_on='LOTE', right_index=True)
material = material[material['LOTE'].isin(var.index)]

# whole days from reception / transfer to the end of the culture
material['lag_recepción'] = (material['Fecha/hora fin'] - material['Fecha recepción']).dt.days
material['lag_traslado'] = (material['Fecha/hora fin'] - material['Fecha traslado']).dt.days


# In[93]:


# some transfer dates look incorrect
material.sort_values(['lag_traslado']).head(20)


# In[94]:


# fix the transfer dates of lots 23027 and 23065 (rows whose transfer happens after the culture ends)
to_fix = material[material['lag_traslado'] < 0].index
cols = [ 'Material', 'Lote interno', 'Lote Proveedor', 'Fecha recepción']
for i in to_fix:
    # Broadcast row i's identifying columns over the whole table to find rows of the same material batch.
    aux = pd.DataFrame([material.loc[i, cols].values] * material.shape[0], index=material.index, columns=cols)
    # Transfer dates used by that batch, most frequent first (value_counts order).
    fechas = material[(material[cols] ==aux).all(axis=1)]['Fecha traslado'].value_counts()
    # Keep only dates earlier than the end of this lot's culture and take the most frequent one.
    # NOTE(review): fechas.index[0] raises IndexError if no earlier date exists for the batch.
    fechas = fechas[fechas.index < material.loc[i, 'Fecha/hora fin']]
    material.loc[i, 'Fecha traslado'] = fechas.index[0]

# Recompute the transfer lag with the corrected dates.
material['lag_traslado'] = (material['Fecha/hora fin'] - material['Fecha traslado']).dt.days

# quantity-weighted terms (summed later per lot and material)
material['tiempo_almacén_w'] = material['tiempo_almacén'] * material['proporcion']
material['lag_recepción_w'] = material['lag_recepción'] * material['proporcion']
material['lag_traslado_w'] = material['lag_traslado'] * material['proporcion']


# In[95]:


# Which of the two dates is later, culture start or transfer, counted over all rows.
material[['Fecha/hora inicio', 'Fecha traslado']].idxmax(axis=1).value_counts()


# In[96]:


# 'Fecha almacen prod' = the later of the culture start and one day after the transfer.
aux = material['Fecha traslado'] + pd.Timedelta(1, 'D')
material['Fecha almacen prod'] = pd.concat([material['Fecha/hora inicio'], aux], axis=1).max(axis=1)


# In[97]:


def _sum_by_lote_and_material(column):
    """Pivot `material` to lot x material, summing `column`; absent combinations become 0."""
    return pd.pivot_table(material, index='LOTE', columns='Material', values=column, aggfunc='sum').fillna(0)


# quantity of each material
qty = _sum_by_lote_and_material('Qty')
# share of each material in the total quantity used by the lot
proporcion = qty.div(qty.sum(axis=1), axis=0)
# days of each material in the main warehouse
almacen = _sum_by_lote_and_material('tiempo_almacén_w')
# days from reception to the end of the culture
recepcion = _sum_by_lote_and_material('lag_recepción_w')
# days from transfer to the end of the culture
traslado = _sum_by_lote_and_material('lag_traslado_w')

# Stack the five tables side by side and flatten the two-level column labels to '<table>_<material>'.
materiales = pd.concat(
    {'qty': qty, 'proporcion': proporcion, 'almacen': almacen, 'recepcion': recepcion, 'traslado': traslado},
    axis=1,
)
materiales.columns = [f'{table}_{material_id!s}' for table, material_id in materiales.columns]
materiales.head()


# In[98]:


# the quantities are highly correlated with product 2
# (correlation of percentile ranks of each material feature with each product column)
c = [materiales.rank(pct=True).corrwith(var['Producto 1']), materiales.rank(pct=True).corrwith(var['Producto 2'])]
c = pd.concat(c, axis=1, keys=['Producto 1', 'Producto 2'])
c.abs().plot(subplots=True, layout=(1, 2), figsize=(15, 3), grid=True);


# In[99]:


# some lots do not appear in the materials table. Two of them are test lots
missing_material = var.index.difference(materiales.index)
missing_material


# ### AMBIENTALES

# In[100]:


# agg_temporal is called with a dict of tables and a key; wrap the selected ambient columns under key 0.
# Bioreactor-room columns ('T_bios', 'H_bios') aggregated over the preinoculum, inoculum and culture windows.
ambientales_ = {0: ambientales[['T_bios', 'H_bios']]}
amb_preinoculo = [agg_temporal(ambientales_, lote, 0, preinoculo.loc[lote, 'Fecha/hora inicio'], preinoculo.loc[lote, 'Fecha/hora fin']) 
                  for lote in preinoculo.index]
amb_preinoculo = pd.concat(amb_preinoculo)

amb_inoculo = [agg_temporal(ambientales_, lote, 0, inoculo.loc[lote, 'Fecha/hora inicio'], inoculo.loc[lote, 'Fecha/hora fin']) 
                  for lote in inoculo.index]
amb_inoculo = pd.concat(amb_inoculo)

amb_cultivo = [agg_temporal(ambientales_, lote, 0, var.loc[lote, 'Fecha/hora inicio'], var.loc[lote, 'Fecha/hora fin']) 
                  for lote in var.index]
amb_cultivo = pd.concat(amb_cultivo)


# In[101]:


# Centrifuge-room columns aggregated from the start of centrifugation 1 to the end of centrifugation 2.
ambientales_ = {0: ambientales[['T_centri', 'H_centri']]}
amb_centrifuga = [agg_temporal(ambientales_, lote, 0, centrifuga.loc[lote, 'DATEVALUE_ini_1'], centrifuga.loc[lote, 'DATEVALUE_fin_2']) 
                  for lote in centrifuga.index]
amb_centrifuga = pd.concat(amb_centrifuga)
amb_centrifuga.shape


# In[102]:


# Main-warehouse columns aggregated per material row, from reception to transfer.
ambientales_ = {0: ambientales[['T_alma_princ', 'H_alma_princ']]}
amb_almacen = [agg_temporal(ambientales_, i, 0, material.loc[i, 'Fecha recepción'], material.loc[i, 'Fecha traslado']) 
                for i in material.index]
amb_almacen = pd.concat(amb_almacen)

# Production-warehouse columns aggregated per material row, from transfer to 'Fecha almacen prod'.
ambientales_ = {0: ambientales[['T_alma_prod', 'H_alma_prod']]}
amb_almacen2 = [agg_temporal(ambientales_, i, 0, material.loc[i, 'Fecha traslado'], material.loc[i, 'Fecha almacen prod']) 
                for i in material.index]
amb_almacen2 = pd.concat(amb_almacen2)


# In[103]:


def _ambient_by_material(amb, material):
    """Collapse per-row ambient aggregates to one quantity-weighted value per lot and material.

    For every column of `amb`, rows without a value are ignored, the quantity shares are
    recomputed over the remaining rows of each (LOTE, Material) group, and the weighted
    values are summed and pivoted to lot x material with labels '<column>_<material>'.
    """
    group_keys = ['LOTE', 'Material']
    pivots = []
    for col in amb.columns:
        has_value = ~amb[col].isna()
        # Quantity share of each row within its group, using only rows that have a value.
        rows = material[has_value]
        totals = rows.groupby(group_keys).sum(numeric_only=True)['Qty'].rename('Total')
        rows = rows.merge(totals, how='left', left_on=group_keys, right_index=True)
        share = (rows['Qty'] / rows['Total']).fillna(1)
        # Weighted value per row, then summed per group.
        weighted = pd.concat([material[group_keys], (amb[col] * share).rename(col)], axis=1)
        weighted = weighted[has_value]
        summed = weighted.groupby(group_keys).sum()
        pivot = pd.pivot_table(summed, index='LOTE', columns='Material')
        pivot.columns = [x[0] + '_' + str(x[1]) for x in pivot.columns]
        pivots.append(pivot)
    return pd.concat(pivots, axis=1)


# Main warehouse and production warehouse, respectively.
amb_almacen_by_material = _ambient_by_material(amb_almacen, material)
amb_almacen_2_by_material = _ambient_by_material(amb_almacen2, material)


# ### DEPENDENCIAS

# In[104]:


# Culture table without identifier / chaining columns.
to_drop = ['LOTE', 'TEST', 'LOTE previo', 'LOTE parental', 'Orden en el encadenado']
new_cultivo = var.drop(columns=to_drop)

# p1: relation with the previous lot. A missing lag is replaced by 10 days (in seconds)
# and a missing weight by 0.
ten_days_in_seconds = 10 * 24 * 60 * 60 # 10 days
p1 = previo[['lag', 'peso_minimo']].rename(columns={'lag': 'lag_previo', 'peso_minimo': 'peso_previo'})
p1['lag_previo'] = p1['lag_previo'].dt.total_seconds().fillna(ten_days_in_seconds)
p1['peso_previo'] = p1['peso_previo'].fillna(0)

# p2: values recorded on the child rows, re-indexed by their parental lot so they describe
# the lot that comes next; lots without a child get 0.
p2 = (
    parental.dropna(subset='LOTE parental')
    .set_index('LOTE parental')[['peso_minimo_bio', 'peso_reduccion_bio']]
    .rename(columns={'peso_minimo_bio': 'peso_siguiente', 'peso_reduccion_bio': 'peso_reduccion'})
    .reindex(var.index)
    .fillna(0)
)

dependencias = var[['Orden en el encadenado']].join(p1).join(p2)


# In[105]:


# Dependency features plotted separately for dependent and independent lots.
dependencias.loc[dependent].plot(subplots=True, layout=(1, 5), figsize=(15, 3), grid=True);
plt.tight_layout()
dependencias.loc[independent].plot(subplots=True, layout=(1, 5), figsize=(15, 3), grid=True);
plt.tight_layout()


# In[106]:


# Same features split by the 'is_parental' flag of `check`.
dependencias.loc[check['is_parental']].plot(subplots=True, layout=(1, 5), figsize=(15, 3));
plt.tight_layout()
dependencias.loc[~check['is_parental']].plot(subplots=True, layout=(1, 5), figsize=(15, 3));
plt.tight_layout()


# In[107]:


# Correlation between the percentile ranks of the dependency features.
dependencias.rank(pct=True).corr()


# In[108]:


# Correlation of the percentile ranks with 'Producto 1'.
dependencias.rank(pct=True).corrwith(var['Producto 1'])


# ### FASES vs CINÉTICOS

# Rellenamos datos con los conjuntos que tienen información redundante

# In[109]:


# inoculum vs kinetics: difference between the redundant columns of both sources
# (`rename` maps kinetics column names to the inoculum table's names)
rename = {'Fecha_ini': 'Fecha/hora inicio', 'Fecha_fin': 'Fecha/hora fin',
          'Turbidez_ini': 'Turbidez inicio cultivo', 'Turbidez_fin': 'Turbidez final culttivo'}
a = inoculo[rename.values()]
b = inoculo_cin.rename(rename, axis=1)[rename.values()]
c = (a - b)
# Date columns are replaced by the absolute difference (max - min) in whole days.
c['Fecha/hora inicio'] = pd.concat([a['Fecha/hora inicio'], b['Fecha/hora inicio']], axis=1).max(axis=1) - pd.concat([a['Fecha/hora inicio'], b['Fecha/hora inicio']], axis=1).min(axis=1) 
c['Fecha/hora inicio'] = c['Fecha/hora inicio'].dt.days
c['Fecha/hora fin'] = pd.concat([a['Fecha/hora fin'], b['Fecha/hora fin']], axis=1).max(axis=1) - pd.concat([a['Fecha/hora fin'], b['Fecha/hora fin']], axis=1).min(axis=1) 
c['Fecha/hora fin'] = c['Fecha/hora fin'].dt.days
# Restrict the plot to lots present in the train (`cultivo`) or test tables.
idx = c.index.intersection(cultivo.index.union(test.index))
c.loc[idx].plot(subplots=True, layout=(2, 2), grid=True, figsize=(12, 6));


# In[110]:


# culture vs kinetics: same comparison using the train and test culture tables
rename = {'Fecha_ini': 'Fecha/hora inicio', 'Fecha_fin': 'Fecha/hora fin', 'Turbidez_ini': 'Turbidez inicio cultivo', 'Turbidez_fin': 'Turbidez fin cultivo'}
a = pd.concat([cultivo[list(rename.values())], test[list(rename.values())]])
b = cultivo_cin.rename(rename, axis=1)[rename.values()]
c = (a - b)
c['Fecha/hora inicio'] = pd.concat([a['Fecha/hora inicio'], b['Fecha/hora inicio']], axis=1).max(axis=1) - pd.concat([a['Fecha/hora inicio'], b['Fecha/hora inicio']], axis=1).min(axis=1) 
c['Fecha/hora inicio'] = c['Fecha/hora inicio'].dt.days
c['Fecha/hora fin'] = pd.concat([a['Fecha/hora fin'], b['Fecha/hora fin']], axis=1).max(axis=1) - pd.concat([a['Fecha/hora fin'], b['Fecha/hora fin']], axis=1).min(axis=1) 
c['Fecha/hora fin'] = c['Fecha/hora fin'].dt.days
idx = c.index.intersection(cultivo.index.union(test.index))
c.loc[idx].plot(subplots=True, layout=(2, 2), grid=True, figsize=(12, 6));


# In[111]:


# centrifugation vs kinetics: centrifuge id and the two turbidity columns
rename = {'ID Centrífuga': 'ID Centrífuga',
          'Turbidez_fin_1': 'Centrifugación 1 turbidez', 'Turbidez_2': 'Centrifugación 2 turbidez'}
a = pd.concat([cultivo[list(rename.values())], test[list(rename.values())]])
b = centrifuga_cin.reset_index('ID Centrífuga').rename(rename, axis=1)[rename.values()]
c = (a - b)
idx = c.index.intersection(cultivo.index.union(test.index))
c.loc[idx].plot(subplots=True, layout=(2, 2), grid=True, figsize=(12, 6));


# In[112]:


# Inoculum table vs inoculum kinetics: each pair is (inoculum column, kinetics column).
# Missing values on one side are filled from the other side, in both directions.
pairs = [['Fecha/hora inicio', 'Fecha_ini'], 
         ['Fecha/hora fin', 'Fecha_fin'], 
         ['Viabilidad final cultivo', 'Viabilidad_fin'], 
         ['Turbidez final culttivo', 'Turbidez_fin'], 
         ['Turbidez inicio cultivo', 'Turbidez_ini'], ]
for x, y in pairs:
    inoculo[x] = inoculo[x].fillna(inoculo_cin[y])
    inoculo_cin[y] = inoculo_cin[y].fillna(inoculo[x])


# In[113]:


# Culture table (`var`) vs culture kinetics, same two-way fill.
pairs = [['Fecha/hora inicio', 'Fecha_ini'], 
         ['Fecha/hora fin', 'Fecha_fin'], 
         ['Turbidez fin cultivo', 'Turbidez_fin'],
         ['Turbidez inicio cultivo', 'Turbidez_ini'],]

for x, y in pairs:
    var[x] = var[x].fillna(cultivo_cin[y])
    cultivo_cin[y] = cultivo_cin[y].fillna(var[x])


# In[114]:


# Culture table (`var`) vs centrifugation kinetics, same two-way fill.
# NOTE(review): the comparison cell earlier in this section maps 'Turbidez_fin_1' (not 'Turbidez_ini_1')
# to 'Centrifugación 1 turbidez' — confirm which kinetics column is the intended counterpart.
# NOTE(review): centrifuga_cin is indexed by ['LOTE', 'ID Centrífuga'] while var is indexed by lot —
# verify that fillna aligns these two indexes as intended.
pairs = [['Centrifugación 1 turbidez', 'Turbidez_ini_1'],
         ['Centrifugación 2 turbidez', 'Turbidez_2'],]

for x, y in pairs:
    var[x] = var[x].fillna(centrifuga_cin[y])
    centrifuga_cin[y] = centrifuga_cin[y].fillna(var[x])


# ### FEATURES

# In[115]:


# Build the full feature matrix A: every source table is concatenated column-wise,
# with `keys` giving the prefix of each table (same order as the list A below).
keys = ['of', 'preinoculo', 'inoculo', 'cultivo', 'dependencias',
        'inoculocin', 'cultivocin', 'centrifugacin',
        'bioinoculo', 'biocultivo', 'centrifuga', 'proceso_lags', 'materiales',
        'amb_preinoculo', 'amb_almacen_by_material', 'amb_almacen_2_by_material', 
        'amb_inoculo', 'amb_cultivo', 'amb_centrifuga',
       ]
A = [of, preinoculo, inoculo, new_cultivo, dependencias]
A += [inoculo_cin, cultivo_cin, centrifuga_cin.reset_index('ID Centrífuga', drop=True)]
A += [bio_inoculo, bio_cultivo, centrifuga, proceso_lags]
A += [materiales]
A += [amb_preinoculo, amb_almacen_by_material, amb_almacen_2_by_material]
A += [amb_inoculo, amb_cultivo, amb_centrifuga]
A = pd.concat(A, axis=1, keys=keys)
# Keep only the lots present in `var` and flatten the two-level column labels to '<table>_<column>'.
A = A.loc[var.index]
A.columns = [x + '_' + y for x, y in A.columns]
viabilidad = [x for x in A.columns if 'viabilidad' in x.lower()]
A[viabilidad] = A[viabilidad] / 10**6 # large numbers cause problems
# Object-dtype columns are cast to float; non-finite entries are replaced by NaN.
float_cols = A.dtypes[A.dtypes == 'object'].index
A[float_cols] = A[float_cols].astype(float)
A[~np.isfinite(A)] = float('nan')

# rename: move features to the prefix of the phase they describe
A.columns = [x.replace('cultivo_ID Centrífuga_', 'centrifuga_ID Centrífuga_') for x in A.columns]
A.columns = [x.replace('inoculocin_', 'inoculo_') for x in A.columns]
A.columns = [x.replace('centrifugacin_', 'centrifugacion_') for x in A.columns]
A.columns = [x.replace('cultivocin_', 'cultivo_') for x in A.columns]
A.columns = [x.replace('cultivo_ID Bioreactor_', 'biocultivo_ID Bioreactor_') for x in A.columns]
rename = {
          'cultivo_Volumen de inóculo utilizado': 'inoculo_Volumen de inóculo utilizado',
          'cultivo_Centrifugación 1 turbidez': 'centrifugacion_Centrifugación 1 turbidez',
          'cultivo_Centrifugación 2 turbidez': 'centrifugacion_Centrifugación 2 turbidez',
         }
A = A.rename(rename, axis=1)
A.shape


# ### FEATURES FILTER

# In[116]:


A.shape


# In[117]:


# `filters` tracks how many features of each prefix (text before the first '_') survive every filter.
filters = pd.Series(A.columns).str.split('_').str[0].value_counts().rename('original')


# In[118]:


# filter 0 - manual: order, raw time stamps, the two target columns and row counts
to_drop = ['of_Orden']
to_drop += ['preinoculo_Fecha/hora inicio',  'preinoculo_Fecha/hora fin',]
to_drop += ['inoculo_Fecha/hora inicio', 'inoculo_Fecha/hora fin',]
to_drop += ['cultivo_Fecha/hora inicio', 'cultivo_Fecha/hora fin', 
            'cultivo_Producto 1', 'cultivo_Producto 2',]
to_drop += ['inoculo_Fecha_ini', 'inoculo_Fecha_fin', 'inoculo_count',]
to_drop += ['cultivo_Fecha_ini', 'cultivo_Fecha_fin', 'cultivo_count',]
to_drop += ['centrifugacion_count_1', 'centrifugacion_count_2',]
to_drop += ['centrifuga_DATEVALUE_ini_1', 'centrifuga_DATEVALUE_fin_1', 'centrifuga_DATEVALUE_ini_2', 'centrifuga_DATEVALUE_fin_2']
B = A.drop(to_drop, axis=1)
print(len(to_drop), B.shape)


# In[119]:


filters = pd.DataFrame(filters).join(pd.Series(B.columns).str.split('_').str[0].value_counts().rename('manual'))


# In[120]:


# filter 1 - drop the features that do not vary (a single distinct non-null value):
to_drop = B.nunique()[B.nunique() == 1].index
B = B.drop(to_drop, axis=1)
print(len(to_drop), B.shape)


# In[121]:


filters = filters.join(pd.Series(B.columns).str.split('_').str[0].value_counts().rename('nunique'))


# In[122]:


# filter 2 - drop the features that barely vary: the most frequent value (mode) accounts
# for more than `limit` of the non-missing observations
limit = 0.9
same_value = (B == B.mode().iloc[0]).sum() / (~B.isna()).sum()
to_drop = same_value[same_value > limit].index  # use the declared threshold instead of a repeated literal
B = B.drop(to_drop, axis=1)
print(len(to_drop), B.shape)


# In[123]:


filters = filters.join(pd.Series(B.columns).str.split('_').str[0].value_counts().rename('almost_nunique'))


# In[124]:


# filter 3 - NaNs: drop the features whose fraction of missing values exceeds the limit
# NOTE(review): despite its name, `nonan_limit` is compared against the NaN fraction, so columns
# with up to 75% missing values are kept — confirm this is the intended direction.
nonan_limit = 0.75
to_drop = B.columns[(B.isna().sum() / B.shape[0]) > nonan_limit]
B = B.drop(to_drop, axis=1)
print(len(to_drop), B.shape)


# In[125]:


filters = filters.join(pd.Series(B.columns).str.split('_').str[0].value_counts().rename('nans'))


# In[126]:


# filter 4 - redundancy: within each group of features whose pairwise rank correlation
# (train rows only) exceeds `limit`, keep a single representative
corr_p = B.rank(pct=True).corrwith(A['cultivo_Producto 1']).abs()

# train rows are the ones with a known target
is_train = ~A['cultivo_Producto 1'].isna()

# we prioritise the features with fewer NaNs and the simpler versions (short name).
# Ranking table: 'nan' is the fraction of NON-missing train values, 'first' the name length.
# Sorted so that the preferred feature comes LAST: most complete, then shortest name.
# The order flags must be real booleans: pandas only checks their truthiness, so the previous
# `-1` was treated as ascending and the longest name ended up preferred.
r = pd.concat([(~B[is_train].isna()).sum() / B[is_train].shape[0],
               pd.Series(B.columns.str.len(), index=B.columns), corr_p],
              axis=1, keys=['nan', 'first', 'corr_p'])
r = r.sort_values(['nan', 'first', 'corr_p'], ascending=[True, False, True])

limit = 0.9
selection = pd.Index([])
discard = pd.Index([])
corr = B[is_train].rank(pct=True).corr().abs() # correlation on train only
# For inspection: most correlated other feature of each feature and the correlation value.
corr_ = corr.melt(ignore_index=False).reset_index()
corr_ = corr_[corr_['index'] != corr_['variable']]
corr_ = pd.pivot_table(corr_, index='index', columns='variable', values='value')
max_corr = corr_.max(axis=1)
max_var_corr = corr_.idxmax(axis=1)
corr_ = pd.concat([max_var_corr, max_corr], axis=1)
for i in r.index:
    if i not in discard and i not in selection:
        # features (including i itself) correlated with i above the limit
        t = corr.loc[i][corr.loc[i] > limit].index
        if len(selection.intersection(t)) == 0:
            # no representative chosen yet: keep the preferred (last) candidate, discard the others
            tt = t.difference(discard)
            aux = r.loc[tt].sort_values(by=['nan', 'first'], ascending=[True, False])
            selection = selection.union(aux.index[[-1]])
            discard = discard.union(aux.index[:-1])
        else:
            # already represented by a selected feature
            discard = discard.union(r.loc[[i]].index)

B = B[selection]
[len(selection), len(discard)]


# In[127]:


filters = filters.join(pd.Series(B.columns).str.split('_').str[0].value_counts().rename('redundancy'))


# In[128]:


# Number of features per prefix after each filter.
filters


# In[129]:


# Add the two target columns back to the filtered feature table.
B = pd.concat([B, A[['cultivo_Producto 1', 'cultivo_Producto 2']]], axis=1)


# In[130]:


B.shape


# In[131]:


# Rows of B for the lots not flagged as TEST in `var`.
features = B.loc[var[~var['TEST']].index]
features.shape


# In[132]:


# Mean and standard deviation of the target on those rows.
features['cultivo_Producto 1'].mean(), features['cultivo_Producto 1'].std()


# In[133]:


# Correlation of the percentile ranks of every column with the target, sorted.
features.rank(pct=True).corr()['cultivo_Producto 1'].drop('cultivo_Producto 1').sort_values().plot(figsize=(15, 3), grid=True, title='Correlación con Producto 1');


# In[134]:


# The 30 largest absolute correlations with the target.
features.rank(pct=True).corr()['cultivo_Producto 1'].drop('cultivo_Producto 1').abs().sort_values(ascending=False).head(30).plot.bar(figsize=(15, 3), grid=True, title='Top Correlación con Producto 1');


# In[135]:


# Share of features per absolute-correlation level (rounded to 0.1, values above 0.3 clipped to 0.3).
count_corr = features.rank(pct=True).corr()['cultivo_Producto 1'].drop(['cultivo_Producto 1', 'cultivo_Producto 2']).abs().round(1).clip(upper=0.3).value_counts().sort_index()
count_corr / count_corr.sum()


# ### IMPUTACIÓN DE NANS

# In[136]:


# Impute missing feature values. `t` holds the lots without a known 'cultivo_Producto 1'.
# NOTE(review): presumably impute_nan uses `t` to keep those rows out of the fitted statistics —
# confirm against its definition.
impute_kind = 'mean'
t = B[B['cultivo_Producto 1'].isna()].index
C = impute_nan(B.drop(['cultivo_Producto 1', 'cultivo_Producto 2'], axis=1), impute_kind, t)


# ### FEATURES SELECTION

# In[137]:


# Feature selection on the NON-imputed features: percentile ranks of the features,
# restricted to the rows with a known target.
x_aux = B.drop(['cultivo_Producto 1', 'cultivo_Producto 2'], axis=1)
y_aux = B['cultivo_Producto 1'].dropna()
x_aux = x_aux.rank(pct=True) # spearman
x_aux = x_aux.loc[y_aux.index] # only_train

# Candidate feature sets produced by the different selection helpers.
sel_corr = select_by_correlation(x_aux, y_aux)
sel_corr_red = {limit: select_by_correlation(x_aux, y_aux, limit) for limit in [0.8, 0.6]}
sel_pvalues = select_by_pvalues(x_aux, y_aux)
sel_pvalues_multi = select_by_pvalues_multi(x_aux, y_aux, 0.05)
sel_pvalues_multi_red = select_by_pvalues_multi_redundancy_drop(x_aux, y_aux, 0.15)

xs = [sel_corr, sel_corr_red[0.8], sel_corr_red[0.6], sel_pvalues, sel_pvalues_multi, sel_pvalues_multi_red]

# The inputs are rebuilt from scratch (same steps as above) before the two-stage selection.
x_aux = B.drop(['cultivo_Producto 1', 'cultivo_Producto 2'], axis=1)
y_aux = B['cultivo_Producto 1'].dropna()
x_aux = x_aux.rank(pct=True) # spearman
x_aux = x_aux.loc[y_aux.index] # only_train
# x_aux_: the same features without the materials and ambient columns.
x_aux_ = x_aux.drop([x for x in B.columns if 'materiales_' in x or 'amb_' in x], axis=1)

# Two-stage selection: first without materials/ambient features, then continue from that
# result offering the materials/ambient columns (`to_try`) as additional candidates.
sel_pvalues_multi_1 = select_by_pvalues_multi(x_aux_, y_aux, 0.05)
to_try = x_aux.columns.difference(x_aux_.columns)
sel_pvalues_multi_2 = select_by_pvalues_multi_continue(x_aux, y_aux, 0.05, sel_pvalues_multi_1, to_try)

# Size of each candidate set.
xs = [sel_corr, sel_corr_red[0.8], sel_corr_red[0.6], sel_pvalues, sel_pvalues_multi, sel_pvalues_multi_red, sel_pvalues_multi_2]
[len(x) for x in xs]


# In[138]:


# Same selection procedures on the IMPUTED features (C).
x_aux = C.copy()
y_aux = B['cultivo_Producto 1'].dropna()
x_aux = x_aux.rank(pct=True) # spearman
x_aux = x_aux.loc[y_aux.index] # only_train

imp_sel_corr = select_by_correlation(x_aux, y_aux)
imp_sel_corr_red = {limit: select_by_correlation(x_aux, y_aux, limit) for limit in [0.8, 0.6]}
imp_sel_pvalues = select_by_pvalues(x_aux, y_aux)
imp_sel_pvalues_multi = select_by_pvalues_multi(x_aux, y_aux, 0.05)
imp_sel_pvalues_multi_red = select_by_pvalues_multi_redundancy_drop(x_aux, y_aux, 0.15)

# Inputs rebuilt again before the two-stage selection on the imputed features.
x_aux = C.copy()
y_aux = B['cultivo_Producto 1'].dropna()
x_aux = x_aux.rank(pct=True) # spearman
x_aux = x_aux.loc[y_aux.index] # only_train
x_aux_ = x_aux.drop([x for x in B.columns if 'materiales_' in x or 'amb_' in x], axis=1)

imp_sel_pvalues_multi_1 = select_by_pvalues_multi(x_aux_, y_aux, 0.05)
to_try = x_aux.columns.difference(x_aux_.columns)
imp_sel_pvalues_multi_2 = select_by_pvalues_multi_continue(x_aux, y_aux, 0.05, imp_sel_pvalues_multi_1, to_try)

# Size of each candidate set (this overwrites the `xs` list of the non-imputed selection).
xs = [imp_sel_corr, imp_sel_corr_red[0.8], imp_sel_corr_red[0.6], imp_sel_pvalues, imp_sel_pvalues_multi, imp_sel_pvalues_multi_red, imp_sel_pvalues_multi_2]
[len(x) for x in xs]


# ### FEATURES STANDARDISATION

# In[139]:


# Standardise the imputed features; the scaler is fitted only on rows with a
# known target and then applied to every row.
scaler = StandardScaler().fit(C[B['cultivo_Producto 1'].notna()])
D = pd.DataFrame(scaler.transform(C), index=C.index, columns=C.columns)

# Binary columns (exactly two distinct values) are not standardised.
binary_cols = C.columns[C.nunique() == 2]
D[binary_cols] = C[binary_cols]


# ### MODELOS

# In[140]:


# Model inputs: reset_index() moves LOTE out of the index into a regular column.
lotes = B.reset_index()['LOTE']
label = B.reset_index()['cultivo_Producto 1'].dropna()
features_all = B.reset_index().drop(columns=['cultivo_Producto 1', 'cultivo_Producto 2', 'LOTE'])
imputed_features_all = C.reset_index().drop(columns=['LOTE'])
z_imputed_features_all = D.reset_index().drop(columns=['LOTE'])

# Keep only the rows that have a label.
features, imputed_features, z_imputed_features = (
    frame.loc[label.index]
    for frame in (features_all, imputed_features_all, z_imputed_features_all)
)
label.std()


# In[141]:


# Evaluation settings shared by every model below.
random_state = 1717
benchmark = ['mean']
scoring = 'neg_root_mean_squared_error'
cv_splitter = KFold(
    n_splits=5,
    shuffle=True,
    random_state=random_state,
)


# #### RANDOM FOREST

# In[142]:


# Random forest hyperparameters.
max_depth, max_features, n_estimators = 3, 1, 100


# In[143]:


# Baseline scores: a cross-validated DummyRegressor per benchmark strategy.
results = {}
for strategy in benchmark:
    dummy = DummyRegressor(strategy=strategy)
    cv = cross_validate(
        dummy, features, label,
        scoring=scoring, cv=cv_splitter, return_train_score=True,
    )
    results[f'bench_{strategy}'] = pd.DataFrame(cv)


# In[144]:


# Candidate feature matrices keyed by selection method. Keys prefixed 'imp_'
# use selections computed on imputed data; every matrix is sliced from the
# non-imputed `features` frame.
n_features = [5, 10, 20, 30]
X = {
    'pvalues_multi': features[sel_pvalues_multi],
    'pvalues_multi_red': features[sel_pvalues_multi_red],
    'pvalues_multi_2': features[sel_pvalues_multi_2],
    'imp_pvalues_multi': features[imp_sel_pvalues_multi],
    'imp_pvalues_multi_red': features[imp_sel_pvalues_multi_red],
    'imp_pvalues_multi_2': features[imp_sel_pvalues_multi_2],
}

# Top-n variants of the ranked selections.
for n in n_features:
    X[f'corr_n={n}'] = features[sel_corr.index[:n]]
    X[f'imp_corr_n={n}'] = features[imp_sel_corr.index[:n]]

    for limit in [0.6, 0.8]:
        X[f'corr_red_{limit}_n={n}'] = features[sel_corr_red[limit][:n]]
        X[f'imp_corr_red_{limit}_n={n}'] = features[imp_sel_corr_red[limit][:n]]

    X[f'pvalues_n={n}'] = features[sel_pvalues.index[:n]]
    X[f'imp_pvalues_n={n}'] = features[imp_sel_pvalues.index[:n]]

# Cross-validate a random forest on every candidate matrix.
for key, value in X.items():
    model = RandomForestRegressor(
        max_depth=max_depth,
        max_features=max_features,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    cv = cross_validate(model, value, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
    key_ = f'{key}_{max_depth}_{max_features}'
    results[key_] = pd.DataFrame(cv)


# In[145]:


# One row per configuration: mean and std of the test score plus the mean
# train score (scores follow `scoring`, i.e. negative RMSE).
m = pd.DataFrame(
    {
        key: [
            value['test_score'].mean(),
            value['test_score'].std(),
            value['train_score'].mean(),
        ]
        for key, value in results.items()
    },
    index=['avg', 'std', 'train'],
).T
m['diff_train_test'] = m['avg'] - m['train']

m.sort_index().plot(subplots=True, layout=(1, 4), figsize=(20, 3))

# Best configurations first.
m.sort_values('avg', ascending=False)


# In[146]:


# Out-of-fold predictions for the chosen random forest configuration:
# top-20 features of the correlation selection with redundancy limit 0.8.
n = 20
limit = 0.8
X_ = features[sel_corr_red[limit][:n]]

model = RandomForestRegressor(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators, random_state=random_state)
cv = cross_validate(model, X_, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
pred = []
# KFold.split yields POSITIONAL train/test indices, so rows are selected with
# .iloc and each prediction is labelled with the original index of its row.
# With .loc this only worked when the label index happened to be 0..n-1.
for x, y in cv_splitter.split(X_):
    model.fit(X_.iloc[x], label.iloc[x])
    pred.append(pd.Series(model.predict(X_.iloc[y]), index=X_.index[y]))
# Reassemble the folds in the original row order (aligned with `label`).
pred = pd.concat(pred).sort_index()


# In[147]:


# Out-of-fold predictions against the labels.
vs = pd.concat([label, pred], axis=1, keys=['label', 'pred'])
err = root_mean_squared_error(pred, label)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), tight_layout=True)

# Left: scatter of label vs prediction with a slope-1 reference line.
vs.plot.scatter(x='label', y='pred', grid=True, ax=ax[0])
ax[0].axline((1500, 1500), slope=1, color='k')

# Right: both series along the row index, with their means as horizontal lines.
pd.concat([label, pred], axis=1).plot(grid=True, ax=ax[1])
ax[1].axhline(label.mean(), color='blue')
ax[1].axhline(pred.mean(), color='orange')

# Correlation between label and prediction, and the RMSE.
print(vs.corr().iloc[0, 1], err)


# In[148]:


# Selected features with rows ordered by LOTE, one subplot per feature.
(
    X_.loc[lotes.loc[X_.index].sort_values().index]
    .reset_index(drop=True)
    .plot(subplots=True, layout=(4, 5), grid=True, figsize=(20, 10))
);


# In[149]:


# Feature importances of `model` as most recently fitted.
importance = pd.Series(data=model.feature_importances_, index=X_.columns)
importance.sort_values(ascending=False).plot.bar(figsize=(15, 3), grid=True)


# In[150]:


## BIASES
# Signed error (pred - label) and two flags built from var['LOTE parental'].
vs['diff'] = vs['pred'] - vs['label']
vs['is_parental'] = lotes.loc[vs.index].isin(var['LOTE parental'].dropna().to_list())
vs['has_parental'] = var.loc[lotes.loc[vs.index].values, 'LOTE parental'].notna().values

fig, ax = plt.subplots(3, 1, figsize=(5, 8), tight_layout=True)
ax = ax.ravel()
vs['diff'].plot.area(stacked=False, title='Error by date', ax=ax[0])
# Error density for rows with and without each flag.
for panel, flag in zip(ax[1:], ['is_parental', 'has_parental']):
    vs.loc[vs[flag], 'diff'].rename(flag).plot.kde(ax=panel, legend=True)
    vs.loc[~vs[flag], 'diff'].rename('not_' + flag).plot.kde(title='Error by ' + flag, legend=True, ax=panel)
for panel in ax:
    panel.grid()


# In[151]:


# Features used by this model.
X_.columns


# #### XGBOOST

# In[152]:


# XGBoost hyperparameters.
max_depth, n_estimators = 3, 100
learning_rate, alpha = 0.05, 0.5


# In[153]:


# Baseline scores: a cross-validated DummyRegressor per benchmark strategy.
results = {}
for strategy in benchmark:
    dummy = DummyRegressor(strategy=strategy)
    cv = cross_validate(
        dummy, features, label,
        scoring=scoring, cv=cv_splitter, return_train_score=True,
    )
    results[f'bench_{strategy}'] = pd.DataFrame(cv)


# In[154]:


# Candidate feature matrices keyed by selection method. Keys prefixed 'imp_'
# use selections computed on imputed data; every matrix is sliced from the
# non-imputed `features` frame.
X = {}
n_features = [5, 10, 20, 30]

X['pvalues_multi'] = features[sel_pvalues_multi]
X['pvalues_multi_red'] = features[sel_pvalues_multi_red]
X['pvalues_multi_2'] = features[sel_pvalues_multi_2]

X['imp_pvalues_multi'] = features[imp_sel_pvalues_multi]
X['imp_pvalues_multi_red'] = features[imp_sel_pvalues_multi_red]
X['imp_pvalues_multi_2'] = features[imp_sel_pvalues_multi_2]

# Top-n variants of the ranked selections.
for n in n_features:
    X['corr_n=' + str(n)] = features[sel_corr.index[:n]]
    X['imp_corr_n=' + str(n)] = features[imp_sel_corr.index[:n]]

    for limit in [0.6, 0.8]:
        X['corr_red_' + str(limit) + '_n=' + str(n)] = features[sel_corr_red[limit][:n]]
        X['imp_corr_red_' + str(limit) + '_n=' + str(n)] = features[imp_sel_corr_red[limit][:n]]

    X['pvalues_n=' + str(n)] = features[sel_pvalues.index[:n]]
    X['imp_pvalues_n=' + str(n)] = features[imp_sel_pvalues.index[:n]]

# Cross-validate an XGBoost regressor on every candidate matrix.
for key, value in X.items():
    model = xgb.XGBRegressor(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate, alpha=alpha, random_state=random_state)
    cv = cross_validate(model, value, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
    # Tag the result with settings this model actually uses. The key previously
    # appended `max_features`, a leftover random-forest parameter that is never
    # passed to XGBRegressor.
    key_ = key + '_' + str(max_depth) + '_' + str(learning_rate)
    results[key_] = pd.DataFrame(cv)


# In[155]:


# One row per configuration: mean and std of the test score plus the mean
# train score (scores follow `scoring`, i.e. negative RMSE).
m = pd.DataFrame(
    {
        key: [
            value['test_score'].mean(),
            value['test_score'].std(),
            value['train_score'].mean(),
        ]
        for key, value in results.items()
    },
    index=['avg', 'std', 'train'],
).T
m['diff_train_test'] = m['avg'] - m['train']

m.sort_index().plot(subplots=True, layout=(1, 4), figsize=(20, 3))

# Best configurations first.
m.sort_values('avg', ascending=False)


# In[156]:


# Out-of-fold predictions for the chosen XGBoost configuration:
# the first 30 features of the p-value selection.
n = 30
X_ = features[sel_pvalues[:n].index]

model = xgb.XGBRegressor(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate, alpha=alpha, random_state=random_state)
cv = cross_validate(model, X_, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
pred = []
# KFold.split yields POSITIONAL train/test indices, so rows are selected with
# .iloc and each prediction is labelled with the original index of its row.
# With .loc this only worked when the label index happened to be 0..n-1.
for x, y in cv_splitter.split(X_):
    model.fit(X_.iloc[x], label.iloc[x])
    pred.append(pd.Series(model.predict(X_.iloc[y]), index=X_.index[y]))
# Reassemble the folds in the original row order (aligned with `label`).
pred = pd.concat(pred).sort_index()


# In[157]:


# Out-of-fold predictions against the labels.
vs = pd.concat([label, pred], axis=1, keys=['label', 'pred'])
err = root_mean_squared_error(pred, label)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), tight_layout=True)

# Left: scatter of label vs prediction with a slope-1 reference line.
vs.plot.scatter(x='label', y='pred', grid=True, ax=ax[0])
ax[0].axline((1500, 1500), slope=1, color='k')

# Right: both series along the row index, with their means as horizontal lines.
pd.concat([label, pred], axis=1).plot(grid=True, ax=ax[1])
ax[1].axhline(label.mean(), color='blue')
ax[1].axhline(pred.mean(), color='orange')

# Correlation between label and prediction, and the RMSE.
print(vs.corr().iloc[0, 1], err)


# In[158]:


# Selected features with rows ordered by LOTE, one subplot per feature.
(
    X_.loc[lotes.loc[X_.index].sort_values().index]
    .reset_index(drop=True)
    .plot(subplots=True, layout=(6, 5), grid=True, figsize=(20, 10))
);


# In[159]:


# Feature importances of `model` as most recently fitted.
importance = pd.Series(data=model.feature_importances_, index=X_.columns)
importance.sort_values(ascending=False).plot.bar(figsize=(15, 3), grid=True)


# In[160]:


## BIASES
# Signed error (pred - label) and two flags built from var['LOTE parental'].
vs['diff'] = vs['pred'] - vs['label']
vs['is_parental'] = lotes.loc[vs.index].isin(var['LOTE parental'].dropna().to_list())
vs['has_parental'] = var.loc[lotes.loc[vs.index].values, 'LOTE parental'].notna().values

fig, ax = plt.subplots(3, 1, figsize=(5, 8), tight_layout=True)
ax = ax.ravel()
vs['diff'].plot.area(stacked=False, title='Error by date', ax=ax[0])
# Error density for rows with and without each flag.
for panel, flag in zip(ax[1:], ['is_parental', 'has_parental']):
    vs.loc[vs[flag], 'diff'].rename(flag).plot.kde(ax=panel, legend=True)
    vs.loc[~vs[flag], 'diff'].rename('not_' + flag).plot.kde(title='Error by ' + flag, legend=True, ax=panel)
for panel in ax:
    panel.grid()


# In[161]:


# Features used by this model.
X_.columns


# #### SVM

# In[162]:


# SVR hyperparameters (`coef` is passed as the SVR `C` argument).
coef, kernel = 0.5, 'rbf'


# In[163]:


# Baseline scores: a cross-validated DummyRegressor per benchmark strategy.
results = {}
for strategy in benchmark:
    dummy = DummyRegressor(strategy=strategy)
    cv = cross_validate(
        dummy, features, label,
        scoring=scoring, cv=cv_splitter, return_train_score=True,
    )
    results[f'bench_{strategy}'] = pd.DataFrame(cv)


# In[164]:


# Candidate feature matrices for SVR: only the selections computed on imputed
# data, sliced from the standardised imputed features.
n_features = [5, 10, 20, 30]
X = {
    'imp_pvalues_multi': z_imputed_features[imp_sel_pvalues_multi],
    'imp_pvalues_multi_red': z_imputed_features[imp_sel_pvalues_multi_red],
    'imp_pvalues_multi_2': z_imputed_features[imp_sel_pvalues_multi_2],
}

# Top-n variants of the ranked selections.
for n in n_features:
    X[f'imp_corr_n={n}'] = z_imputed_features[imp_sel_corr.index[:n]]

    for limit in [0.6, 0.8]:
        X[f'imp_corr_red_{limit}_n={n}'] = z_imputed_features[imp_sel_corr_red[limit][:n]]

    X[f'imp_pvalues_n={n}'] = z_imputed_features[imp_sel_pvalues.index[:n]]

# Cross-validate an SVR on every candidate matrix.
for key, value in X.items():
    model = SVR(C=coef, kernel=kernel)
    cv = cross_validate(model, value, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
    key_ = f'{key}_svr_{coef}'
    results[key_] = pd.DataFrame(cv)


# In[165]:


# One row per configuration: mean and std of the test score plus the mean
# train score (scores follow `scoring`, i.e. negative RMSE).
m = pd.DataFrame(
    {
        key: [
            value['test_score'].mean(),
            value['test_score'].std(),
            value['train_score'].mean(),
        ]
        for key, value in results.items()
    },
    index=['avg', 'std', 'train'],
).T
m['diff_train_test'] = m['avg'] - m['train']

# Plotted in insertion order (no sort_index here).
m.plot(subplots=True, layout=(1, 4), figsize=(20, 3))

# Best configurations first.
m.sort_values('avg', ascending=False)


# #### ELASTICNET

# In[166]:


# ElasticNet hyperparameters.
alpha, l1_ratio = 1, 0.5


# In[167]:


# Baseline scores: a cross-validated DummyRegressor per benchmark strategy.
results = {}
for strategy in benchmark:
    dummy = DummyRegressor(strategy=strategy)
    cv = cross_validate(
        dummy, features, label,
        scoring=scoring, cv=cv_splitter, return_train_score=True,
    )
    results[f'bench_{strategy}'] = pd.DataFrame(cv)


# In[168]:


# Candidate feature matrices for ElasticNet: only the selections computed on
# imputed data, sliced from the standardised imputed features.
n_features = [5, 10, 20, 30]
X = {
    'imp_pvalues_multi': z_imputed_features[imp_sel_pvalues_multi],
    'imp_pvalues_multi_red': z_imputed_features[imp_sel_pvalues_multi_red],
    'imp_pvalues_multi_2': z_imputed_features[imp_sel_pvalues_multi_2],
}

# Top-n variants of the ranked selections.
for n in n_features:
    X[f'imp_corr_n={n}'] = z_imputed_features[imp_sel_corr.index[:n]]

    for limit in [0.6, 0.8]:
        X[f'imp_corr_red_{limit}_n={n}'] = z_imputed_features[imp_sel_corr_red[limit][:n]]

    X[f'imp_pvalues_n={n}'] = z_imputed_features[imp_sel_pvalues.index[:n]]

# Cross-validate an ElasticNet on every candidate matrix.
# NOTE: LinearRegression, Ridge(alpha) and Lasso(alpha) variants were evaluated
# here with the same cross_validate call under the result-key suffixes
# '_lr_', '_ridge_<alpha>' and '_lasso_<alpha>'; they are currently disabled.
for key, value in X.items():
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    cv = cross_validate(model, value, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
    results[f'{key}_enet_{alpha}_l1_{l1_ratio}'] = pd.DataFrame(cv)


# In[169]:


# One row per configuration: mean and std of the test score plus the mean
# train score (scores follow `scoring`, i.e. negative RMSE).
m = pd.DataFrame(
    {
        key: [
            value['test_score'].mean(),
            value['test_score'].std(),
            value['train_score'].mean(),
        ]
        for key, value in results.items()
    },
    index=['avg', 'std', 'train'],
).T
m['diff_train_test'] = m['avg'] - m['train']

m.sort_index().plot(subplots=True, layout=(1, 4), figsize=(20, 3))

# Best configurations first.
m.sort_values('avg', ascending=False)


# In[170]:


# Feature matrix for the chosen ElasticNet configuration: standardised imputed
# features from the two-stage p-value selection.
X_ = z_imputed_features[imp_sel_pvalues_multi_2]


# In[171]:


# Out-of-fold predictions with the current alpha / l1_ratio.
model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
cv = cross_validate(model, X_, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
pred = []
# KFold.split yields POSITIONAL train/test indices, so rows are selected with
# .iloc and each prediction is labelled with the original index of its row.
# With .loc this only worked when the label index happened to be 0..n-1.
for x, y in cv_splitter.split(X_):
    model.fit(X_.iloc[x], label.iloc[x])
    p = model.predict(X_.iloc[y])
    pred.append(pd.Series(p, index=X_.index[y]))
# Reassemble the folds in the original row order (aligned with `label`).
pred = pd.concat(pred).sort_index()


# In[172]:


# Out-of-fold predictions against the labels.
vs = pd.concat([label, pred], axis=1, keys=['label', 'pred'])
err = root_mean_squared_error(pred, label)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), tight_layout=True)

# Left: scatter of label vs prediction with a slope-1 reference line.
vs.plot.scatter(x='label', y='pred', grid=True, ax=ax[0])
ax[0].axline((1500, 1500), slope=1, color='k')

# Right: both series along the row index, with their means as horizontal lines.
pd.concat([label, pred], axis=1).plot(grid=True, ax=ax[1])
ax[1].axhline(label.mean(), color='blue')
ax[1].axhline(pred.mean(), color='orange')

# Correlation between label and prediction, and the RMSE.
print(vs.corr().iloc[0, 1], err)


# In[173]:


# Selected (standardised) features, one subplot per feature.
X_.plot(
    subplots=True,
    layout=(4, 6),
    figsize=(20, 8),
    grid=True,
);


# In[174]:


# Coefficients of `model` as most recently fitted.
coefs = pd.Series(data=model.coef_, index=X_.columns)
coefs.sort_values(ascending=False).plot.bar(
    figsize=(15, 3),
    grid=True,
    title='Features Coefs',
);


# In[175]:


## BIASES
# Signed error (pred - label) and two flags built from var['LOTE parental'].
vs['diff'] = vs['pred'] - vs['label']
vs['is_parental'] = lotes.loc[vs.index].isin(var['LOTE parental'].dropna().to_list())
vs['has_parental'] = var.loc[lotes.loc[vs.index].values, 'LOTE parental'].notna().values

fig, ax = plt.subplots(3, 1, figsize=(5, 8), tight_layout=True)
ax = ax.ravel()
vs['diff'].plot.area(stacked=False, title='Error by date', ax=ax[0])
# Error density for rows with and without each flag.
for panel, flag in zip(ax[1:], ['is_parental', 'has_parental']):
    vs.loc[vs[flag], 'diff'].rename(flag).plot.kde(ax=panel, legend=True)
    vs.loc[~vs[flag], 'diff'].rename('not_' + flag).plot.kde(title='Error by ' + flag, legend=True, ax=panel)
for panel in ax:
    panel.grid()


# In[176]:


# Features used by this model.
X_.columns


# #### ELASTICNET SIMPLE MODEL

# In[177]:


# Pairwise rank correlations between the selected features in long format;
# the `index < variable` filter keeps each unordered pair once.
c = (
    z_imputed_features[imp_sel_pvalues_multi_2]
    .rank(pct=True)
    .corr()
    .melt(ignore_index=False, value_name='corr')
    .reset_index()
)
c = c.loc[c['index'] < c['variable']]
c.sort_values('corr')


# In[178]:


# Features whose absolute coefficient in `coefs` is below 20.
to_drop = coefs.loc[coefs.abs() < 20].index
to_drop


# In[179]:


# Add the reduced feature set (two-stage selection minus `to_drop`) and
# cross-validate ElasticNet on every entry of X, including the earlier ones.
X['imp_pvalues_multi_2_drop'] = z_imputed_features[imp_sel_pvalues_multi_2.difference(to_drop)]
for key, value in X.items():
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
    cv = cross_validate(
        model, value, label,
        scoring=scoring, cv=cv_splitter, return_train_score=True,
    )
    results[f'{key}_enet_{alpha}_l1_{l1_ratio}'] = pd.DataFrame(cv)


# In[180]:


# One row per configuration: mean and std of the test score plus the mean
# train score (scores follow `scoring`, i.e. negative RMSE).
m = pd.DataFrame(
    {
        key: [
            value['test_score'].mean(),
            value['test_score'].std(),
            value['train_score'].mean(),
        ]
        for key, value in results.items()
    },
    index=['avg', 'std', 'train'],
).T
m['diff_train_test'] = m['avg'] - m['train']

m.sort_index().plot(subplots=True, layout=(1, 4), figsize=(20, 3))

# Best configurations first.
m.sort_values('avg', ascending=False)


# In[181]:


# Feature matrix of the simplified model: two-stage p-value selection minus
# the features listed in `to_drop`.
X_ = z_imputed_features[imp_sel_pvalues_multi_2.difference(to_drop)]


# In[182]:


# Out-of-fold predictions with the current alpha / l1_ratio.
model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
cv = cross_validate(model, X_, label, scoring=scoring, cv=cv_splitter, return_train_score=True)
pred = []
# KFold.split yields POSITIONAL train/test indices, so rows are selected with
# .iloc and each prediction is labelled with the original index of its row.
# With .loc this only worked when the label index happened to be 0..n-1.
for x, y in cv_splitter.split(X_):
    model.fit(X_.iloc[x], label.iloc[x])
    p = model.predict(X_.iloc[y])
    pred.append(pd.Series(p, index=X_.index[y]))
# Reassemble the folds in the original row order (aligned with `label`).
pred = pd.concat(pred).sort_index()


# In[183]:


# Out-of-fold predictions against the labels.
vs = pd.concat([label, pred], axis=1, keys=['label', 'pred'])
err = root_mean_squared_error(pred, label)

fig, ax = plt.subplots(1, 2, figsize=(10, 4), tight_layout=True)

# Left: scatter of label vs prediction with a slope-1 reference line.
vs.plot.scatter(x='label', y='pred', grid=True, ax=ax[0])
ax[0].axline((1500, 1500), slope=1, color='k')

# Right: both series along the row index, with their means as horizontal lines.
pd.concat([label, pred], axis=1).plot(grid=True, ax=ax[1])
ax[1].axhline(label.mean(), color='blue')
ax[1].axhline(pred.mean(), color='orange')

# Correlation between label and prediction, and the RMSE.
print(vs.corr().iloc[0, 1], err)


# In[184]:


# Selected (standardised) features, one subplot per feature.
X_.plot(
    subplots=True,
    layout=(4, 6),
    figsize=(20, 8),
    grid=True,
);


# In[185]:


# Coefficients of `model` as most recently fitted, as a vertical bar chart.
coefs = pd.Series(data=model.coef_, index=X_.columns)
coefs.sort_values(ascending=False).plot.bar(
    figsize=(15, 3),
    grid=True,
    title='Features Coefs',
);


# In[186]:


# Same coefficients as a horizontal bar chart.
coefs.sort_values(ascending=False).plot.barh(
    figsize=(3, 10),
    grid=True,
    title='Features Coefficients',
);


# In[187]:


## BIASES
# Signed error (pred - label) and two flags built from var['LOTE parental'].
vs['diff'] = vs['pred'] - vs['label']
vs['is_parental'] = lotes.loc[vs.index].isin(var['LOTE parental'].dropna().to_list())
vs['has_parental'] = var.loc[lotes.loc[vs.index].values, 'LOTE parental'].notna().values

fig, ax = plt.subplots(3, 1, figsize=(5, 8), tight_layout=True)
ax = ax.ravel()
vs['diff'].plot.area(stacked=False, title='Error by date', ax=ax[0])
# Error density for rows with and without each flag.
for panel, flag in zip(ax[1:], ['is_parental', 'has_parental']):
    vs.loc[vs[flag], 'diff'].rename(flag).plot.kde(ax=panel, legend=True)
    vs.loc[~vs[flag], 'diff'].rename('not_' + flag).plot.kde(title='Error by ' + flag, legend=True, ax=panel)
for panel in ax:
    panel.grid()


# In[188]:


# Features used by this model.
X_.columns


# #### HIPERPARÁMETROS

# In[189]:


# ElasticNet hyperparameter review: look for the best (alpha, l1_ratio) pair.
# alpha runs over 0.5 ... 1.5 and l1_ratio over 0.1 ... 0.9, both in steps of 0.1.
alphas = [step / 10 for step in range(5, 16)]
l1_ratios = [step / 10 for step in range(1, 10)]

XX = {
    'imp_pvalues_multi_2_drop': z_imputed_features[imp_sel_pvalues_multi_2.difference(to_drop)],
}

# Cross-validate every grid point on every feature set in XX.
results_h = {}
for a in alphas:
    for l1 in l1_ratios:
        for key, value in XX.items():
            model = ElasticNet(alpha=a, l1_ratio=l1, random_state=random_state)
            cv = cross_validate(
                model, value, label,
                scoring=scoring, cv=cv_splitter, return_train_score=True,
            )
            results_h[f'{key}_enet_{a}_l1_{l1}'] = pd.DataFrame(cv)


# In[190]:


# One row per grid point: mean and std of the test score plus the mean train
# score (scores follow `scoring`, i.e. negative RMSE).
m = pd.DataFrame(
    {
        key: [
            value['test_score'].mean(),
            value['test_score'].std(),
            value['train_score'].mean(),
        ]
        for key, value in results_h.items()
    },
    index=['avg', 'std', 'train'],
).T
m['diff_train_test'] = m['avg'] - m['train']

m.sort_index().plot(subplots=True, layout=(1, 4), figsize=(20, 3))

# Best grid points first.
m.sort_values('avg', ascending=False)


# In[191]:


# Heatmaps of every summary metric over the (alpha, l1_ratio) grid.
# Keep only the rows of the reduced feature set evaluated with ElasticNet.
aux = m[m.index.str.startswith('imp_pvalues_multi_2_drop_enet_')].copy()
# Keys end in '..._enet_<alpha>_l1_<l1_ratio>', so the hyperparameters are the
# third-from-last and last '_'-separated tokens.
aux['alpha'] = aux.index.str.split('_').str[-3].astype(float)
aux['l1_ratio'] = aux.index.str.split('_').str[-1].astype(float)
metrics = {x: pd.pivot_table(aux, index='alpha', columns='l1_ratio', values=x) for x in m.columns}
fig, axs = plt.subplots(1, 4, figsize=(15, 4), tight_layout=True)
for metric, ax in zip(m.columns, axs):
    sns.heatmap(metrics[metric], cmap='Spectral', annot=True, fmt='0.0f', cbar=False, ax=ax)
    # Label each panel; without a title the four heatmaps cannot be told apart.
    ax.set_title(metric)


# #### TRAIN TEST CHECK

# In[192]:


# Hold-out check: X_ is the most recently defined feature matrix (imputed
# features chosen through the p-value selection).
X_train, X_test, y_train, y_test = train_test_split(X_, label, test_size=0.2, random_state=random_state)

# Build the estimator explicitly with the chosen alpha / l1_ratio. The global
# `model` is overwritten by the hyperparameter loop, so reusing it here would
# silently evaluate the last grid candidate instead of the selected settings.
model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# RMSE on both splits and the gap between them.
error_test = root_mean_squared_error(y_test, y_pred_test)
error_train = root_mean_squared_error(y_train, y_pred_train)
dif = error_test-error_train
print('Error en test: ', error_test)
print('Error en train: ', error_train)
print('Diferencia: ', dif)


# In[ ]:




