Skip to content

Commit

Permalink
ENH: Add support for reading 110-format Stata dta files (#58044)
Browse files Browse the repository at this point in the history
* ENH: Add support for reading 110-format Stata dta files

* Add whatsnew note to v3.0.0.rst

* Add a test data file containing value labels

* Compare version number inclusively when determining whether to use old or new typlist version

* Add a big-endian version of the test data set
  • Loading branch information
cmjcharlton committed May 8, 2024
1 parent 8d543ba commit d62d77b
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 13 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Expand Up @@ -44,6 +44,8 @@ Other enhancements
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
Expand Down
6 changes: 3 additions & 3 deletions pandas/io/stata.py
Expand Up @@ -91,7 +91,7 @@

_version_error = (
"Version of given Stata file is {version}. pandas supports importing "
"versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
"versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
"and 119 (Stata 15/16, over 32,767 variables)."
)
Expand Down Expand Up @@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:

def _read_old_header(self, first_char: bytes) -> None:
self._format_version = int(first_char[0])
if self._format_version not in [104, 105, 108, 111, 113, 114, 115]:
if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]:
raise ValueError(_version_error.format(version=self._format_version))
self._set_encoding()
self._byteorder = ">" if self._read_int8() == 0x1 else "<"
Expand All @@ -1408,7 +1408,7 @@ def _read_old_header(self, first_char: bytes) -> None:
self._time_stamp = self._get_time_stamp()

# descriptors
if self._format_version > 108:
if self._format_version >= 111:
typlist = [int(c) for c in self._path_or_buf.read(self._nvar)]
else:
buf = self._path_or_buf.read(self._nvar)
Expand Down
Binary file added pandas/tests/io/data/stata/stata-compat-110.dta
Binary file not shown.
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata4_110.dta
Binary file not shown.
18 changes: 8 additions & 10 deletions pandas/tests/io/test_stata.py
Expand Up @@ -225,11 +225,9 @@ def test_read_dta3(self, file, datapath):

tm.assert_frame_equal(parsed, expected)

@pytest.mark.parametrize(
"file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
)
def test_read_dta4(self, file, datapath):
file = datapath("io", "data", "stata", f"{file}.dta")
@pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117])
def test_read_dta4(self, version, datapath):
file = datapath("io", "data", "stata", f"stata4_{version}.dta")
parsed = self.read_dta(file)

expected = DataFrame.from_records(
Expand Down Expand Up @@ -271,11 +269,11 @@ def test_read_dta4(self, file, datapath):
# stata doesn't save .category metadata
tm.assert_frame_equal(parsed, expected)

@pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
def test_readold_dta4(self, file, datapath):
@pytest.mark.parametrize("version", [105, 108])
def test_readold_dta4(self, version, datapath):
# This test is the same as test_read_dta4 above except that the columns
# had to be renamed to match the restrictions in older file format
file = datapath("io", "data", "stata", f"{file}.dta")
file = datapath("io", "data", "stata", f"stata4_{version}.dta")
parsed = self.read_dta(file)

expected = DataFrame.from_records(
Expand Down Expand Up @@ -2002,7 +2000,7 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path):
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)


@pytest.mark.parametrize("version", [105, 108, 111, 113, 114])
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114])
def test_backward_compat(version, datapath):
data_base = datapath("io", "data", "stata")
ref = os.path.join(data_base, "stata-compat-118.dta")
Expand All @@ -2012,7 +2010,7 @@ def test_backward_compat(version, datapath):
tm.assert_frame_equal(old_dta, expected, check_dtype=False)


@pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118])
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
def test_bigendian(version, datapath):
ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
Expand Down

0 comments on commit d62d77b

Please sign in to comment.