Missing Data
All of the models can handle missing data. For performance reasons, the default is not to do any checking for missing data. If, however, you would like missing data to be handled internally, you can do so by using the missing keyword argument. The default, missing='none', does nothing: no check for NaN values is performed.
In [1]: import statsmodels.api as sm
ImportErrorTraceback (most recent call last)
<ipython-input-1-085740203b77> in <module>()
----> 1 import statsmodels.api as sm
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/api.py in <module>()
5 from . import regression
6 from .regression.linear_model import OLS, GLS, WLS, GLSAR
----> 7 from .regression.recursive_ls import RecursiveLS
8 from .regression.quantile_regression import QuantReg
9 from .regression.mixed_linear_model import MixedLM
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/regression/recursive_ls.py in <module>()
14 from statsmodels.regression.linear_model import OLS
15 from statsmodels.tools.data import _is_using_pandas
---> 16 from statsmodels.tsa.statespace.mlemodel import (
17 MLEModel, MLEResults, MLEResultsWrapper)
18 from statsmodels.tools.tools import Bunch
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/tsa/statespace/mlemodel.py in <module>()
16 from scipy.stats import norm
17
---> 18 from .simulation_smoother import SimulationSmoother
19 from .kalman_smoother import SmootherResults
20 from .kalman_filter import (INVERT_UNIVARIATE, SOLVE_LU)
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/tsa/statespace/simulation_smoother.py in <module>()
8
9 import numpy as np
---> 10 from .kalman_smoother import KalmanSmoother
11 from . import tools
12
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/tsa/statespace/kalman_smoother.py in <module>()
9 import numpy as np
10
---> 11 from statsmodels.tsa.statespace.representation import OptionWrapper
12 from statsmodels.tsa.statespace.kalman_filter import (KalmanFilter,
13 FilterResults)
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/tsa/statespace/representation.py in <module>()
8
9 import numpy as np
---> 10 from .tools import (
11 find_best_blas_type, validate_matrix_shape, validate_vector_shape
12 )
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/tsa/statespace/tools.py in <module>()
205 'z': _statespace.zcopy_index_vector
206 })
--> 207 set_mode(compatibility=None)
208
209
/builddir/build/BUILD/statsmodels-0.9.0/statsmodels/tsa/statespace/tools.py in set_mode(compatibility)
57 if not compatibility:
58 from scipy.linalg import cython_blas
---> 59 from . import (_representation, _kalman_filter, _kalman_smoother,
60 _simulation_smoother, _tools)
61 compatibility_mode = False
ImportError: cannot import name _representation
In [2]: data = sm.datasets.longley.load()
NameErrorTraceback (most recent call last)
<ipython-input-2-f0fe0de8afb1> in <module>()
----> 1 data = sm.datasets.longley.load()
NameError: name 'sm' is not defined
In [3]: data.exog = sm.add_constant(data.exog)
NameErrorTraceback (most recent call last)
<ipython-input-3-d96db36c0463> in <module>()
----> 1 data.exog = sm.add_constant(data.exog)
NameError: name 'sm' is not defined
# add in some missing data
In [4]: missing_idx = np.array([False] * len(data.endog))
NameErrorTraceback (most recent call last)
<ipython-input-4-39a1dd095303> in <module>()
----> 1 missing_idx = np.array([False] * len(data.endog))
NameError: name 'data' is not defined
In [5]: missing_idx[[4, 10, 15]] = True
NameErrorTraceback (most recent call last)
<ipython-input-5-a87c14eca92b> in <module>()
----> 1 missing_idx[[4, 10, 15]] = True
NameError: name 'missing_idx' is not defined
In [6]: data.endog[missing_idx] = np.nan
NameErrorTraceback (most recent call last)
<ipython-input-6-0be76ba68456> in <module>()
----> 1 data.endog[missing_idx] = np.nan
NameError: name 'data' is not defined
In [7]: ols_model = sm.OLS(data.endog, data.exog)
NameErrorTraceback (most recent call last)
<ipython-input-7-8f5b93cb9205> in <module>()
----> 1 ols_model = sm.OLS(data.endog, data.exog)
NameError: name 'sm' is not defined
In [8]: ols_fit = ols_model.fit()
NameErrorTraceback (most recent call last)
<ipython-input-8-7de84bc10712> in <module>()
----> 1 ols_fit = ols_model.fit()
NameError: name 'ols_model' is not defined
In [9]: print(ols_fit.params)
NameErrorTraceback (most recent call last)
<ipython-input-9-eab44913bc4d> in <module>()
----> 1 print(ols_fit.params)
NameError: name 'ols_fit' is not defined
This silently fails and all of the model parameters are NaN, which is probably not what you expected. If you are not sure whether you have missing data, you can use missing='raise'. This will raise a MissingDataError during model instantiation if missing data is present, so that you know something is wrong with your input data.
In [10]: ols_model = sm.OLS(data.endog, data.exog, missing='raise')
NameErrorTraceback (most recent call last)
<ipython-input-10-5debd60362bf> in <module>()
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='raise')
NameError: name 'sm' is not defined
If you want statsmodels to handle the missing data by dropping the observations, use missing = ‘drop’.
In [11]: ols_model = sm.OLS(data.endog, data.exog, missing='drop')
NameErrorTraceback (most recent call last)
<ipython-input-11-52f11397a385> in <module>()
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='drop')
NameError: name 'sm' is not defined
We are considering adding a configuration framework so that you can set the option with a global setting.