Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement JSON DataFile serialization #611

Merged
merged 32 commits into from
Dec 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e1984d8
Implement the start of rope.base.serializer
lieryan Dec 16, 2022
b76b245
Implement None <-> null
lieryan Dec 16, 2022
81e2240
Reject unrecognized objects
lieryan Dec 16, 2022
b908b2a
Implement tuple <-> array conversion
lieryan Dec 16, 2022
31108d4
Implement tuple and list <-> array conversion
lieryan Dec 16, 2022
01372f7
Implement dict with string keys <-> object conversion
lieryan Dec 16, 2022
5855838
Implement dict with complex key <-> object conversion
lieryan Dec 16, 2022
f2f65e5
Add test for the shape of encoded data
lieryan Dec 16, 2022
20f1a19
Use single letter type code, for compactness
lieryan Dec 16, 2022
1ac83e3
Add test for version number
lieryan Dec 16, 2022
90c636d
Document the types that can be used for dict keys
lieryan Dec 16, 2022
781c30c
Simplify encoding for dict with non-numeric string key
lieryan Dec 16, 2022
4837927
Split the assertion for better clarity
lieryan Dec 16, 2022
23f10f3
Add a few more corner cases
lieryan Dec 16, 2022
0085530
Add test for complex serialization with references
lieryan Dec 16, 2022
f067b8d
Annotate what can be dict key
lieryan Dec 16, 2022
1e89b0e
Allow None to be dict key
lieryan Dec 16, 2022
d4af818
Reserve special key "$"
lieryan Dec 16, 2022
266f735
New list encoding to use object encoding
lieryan Dec 16, 2022
06d7141
Simplify list encoding
lieryan Dec 16, 2022
d841ad2
Update documentation to reflect current implementation
lieryan Dec 16, 2022
d4e356c
Change ScopeInfo.__getstate__() and __setstate__() to use rope.base.s…
lieryan Dec 16, 2022
e8fbaa5
Added version 2 of the serializer
lieryan Dec 16, 2022
b66d7f5
Implement version 2 serializer
lieryan Dec 16, 2022
08a1d07
Use version 2 serializer for ScopeInfo
lieryan Dec 16, 2022
6fd3739
Don't store "references" if it's empty
lieryan Dec 16, 2022
86c1e14
Improve error handling
lieryan Dec 16, 2022
ffa4851
Write both the pickle and .json version of _DataFile
lieryan Dec 16, 2022
a91948a
Blacken
lieryan Dec 28, 2022
5197389
Merge remote-tracking branch 'origin/master' into lieryan-implement-j…
lieryan Dec 28, 2022
77dcdc2
Update CHANGELOG.md
lieryan Dec 28, 2022
89ee77f
Use ExitStack()
lieryan Dec 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- #626 Install pre-commit hooks on rope repository (@lieryan)
- #548 Implement MoveGlobal using string as destination module names (@lieryan)
- #627 Fix parsing of octal literal (@lieryan)
- #611 Implement JSON DataFile serialization (@lieryan)

# Release 1.6.0

Expand Down
14 changes: 12 additions & 2 deletions rope/base/oi/memorydb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from rope.base import utils
from rope.base.oi import objectdb
from rope.base.serializer import json_to_python, python_to_json


class MemoryDB(objectdb.FileDict):
Expand Down Expand Up @@ -115,7 +116,16 @@ def add_call(self, parameters, returned):
self.call_info[parameters] = returned

def __getstate__(self):
return (self.call_info, self.per_name)
original_data = (self.call_info, self.per_name)
encoded = python_to_json(original_data, version=2)
encoded["$"] = "ScopeInfo"
return encoded

def __setstate__(self, data):
self.call_info, self.per_name = data
if isinstance(data, tuple) and len(data) == 2:
# legacy pickle-based serialization
self.call_info, self.per_name = data
else:
# new serialization
assert data["$"] == "ScopeInfo"
self.call_info, self.per_name = json_to_python(data)
7 changes: 6 additions & 1 deletion rope/base/project.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
import os
import sys
import warnings
from contextlib import ExitStack
from typing import Optional

import rope.base.fscommands # Use full qualification for clarity.
Expand Down Expand Up @@ -393,8 +395,11 @@ def read_data(self, name):
def write_data(self, name, data):
if self.project.ropefolder is not None:
file = self._get_file(name)
with open(file.real_path, "wb") as output_file:
with ExitStack() as cm:
output_file = cm.enter_context(open(file.real_path, "wb"))
output_file2 = cm.enter_context(open(file.real_path + ".json", "w"))
pickle.dump(data, output_file, 2)
json.dump(data, output_file2, default=lambda o: o.__getstate__())

def add_write_hook(self, hook):
self.hooks.append(hook)
Expand Down
145 changes: 145 additions & 0 deletions rope/base/serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""
This module serves to convert a data structure composed of Python primitives
(dict, list, tuple, int, str, None) to JSON-serializable primitives (object,
array, number, str, null).

A core feature of this serializer is that the produced data will round-trip to
identical objects when deserialized by the standard library json module.
In other words, this property always holds:

>>> original_data = ... any JSON ...
>>> encoded = python_to_json(original_data)
>>> serialized = json.dumps(encoded)
>>> decoded = json.loads(serialized)
>>> rehydrated_data = json_to_python(decoded)

>>> assert rehydrated_data == original_data
>>> assert encoded == decoded

A couple of challenges in straight serialization that this module helps resolve:

- json.dumps() maps both Python list and tuple to JSON array. This module
provides two variants:

- In version=1, this module converts Python list `[1, 2, 3]` as-is and
converts Python tuple `(1, 2, 3)` to special object construct
`{"$": "t", "items": [1, 2, 3]}`

- In version=2, it is the other way around, this module converts Python tuple
`(1, 2, 3)` as-is and converts Python list `[1, 2, 3]` to special object
construct `{"$": "l", "items": [1, 2, 3]}`

- Python dict keys can be a str/int/tuple/None, but JSON Object keys must be
strings. This module replaces `dict` keys with a `refid` which can be resolved
using the `encoded["references"][refid]` lookup table. As a small
optimisation, a dict key that is a string and isn't only numeric is encoded
directly into the object instead of going through the lookup table.

- Python dict keys cannot be another dict because it is unhashable, therefore
there's no encoding for having objects as keys either.

- There is currently no support for floating point numbers.

Note that `json_to_python` only accepts Python objects that can be the output
of `python_to_json`; there is NO guarantee for going the other way around. This
may or may not work:

>>> python_to_json(json_to_python(original_data)) == original_data

"""


def python_to_json(o, version=1):
    """Wrap ``o`` in a versioned envelope made only of JSON primitives.

    The returned dict always carries ``"v"`` (the encoding version) and
    ``"data"`` (the encoded payload).  A ``"references"`` list is present
    only when at least one dict key had to be stored by reference.

    Raises ValueError when ``version`` is neither 1 nor 2.
    """
    if version not in (1, 2):
        raise ValueError(f"Unexpected version {version}")

    # _py2js appends to this list whenever a dict key cannot be written
    # directly as a JSON object key.
    lookup_table = []
    envelope = {
        "v": version,
        "data": _py2js(o, lookup_table, version=version),
    }
    if lookup_table:
        envelope["references"] = lookup_table
    return envelope


def json_to_python(o):
    """Decode an envelope produced by ``python_to_json`` back to Python data.

    ``o`` must be a dict with a ``"v"`` version number and a ``"data"``
    payload; ``"references"`` is optional and is treated as an empty list
    when absent.

    Raises ValueError when the version is neither 1 nor 2, and KeyError when
    ``"v"`` or ``"data"`` is missing.
    """
    version = o["v"]
    if version not in (1, 2):
        raise ValueError(f"Unexpected version {version}")
    # The encoder always emits references as a list, so the fallback for a
    # missing table is an empty list rather than a dict.
    references = o.get("references", [])
    return _js2py(o["data"], references, version)


def _py2js(o, references, version):
if isinstance(o, (str, int)) or o is None:
return o
elif isinstance(o, tuple):
if version == 1:
return {
"$": "t",
"items": [_py2js(item, references, version) for item in o],
}
else:
return [_py2js(item, references, version) for item in o]
elif isinstance(o, list):
if version == 2:
return {
"$": "l",
"items": [_py2js(item, references, version) for item in o],
}
else:
return [_py2js(item, references, version) for item in o]
elif isinstance(o, dict):
result = {}
for pykey, pyvalue in o.items():
if pykey == "$":
raise ValueError('dict cannot contain reserved key "$"')
if isinstance(pykey, str) and not pykey.isdigit():
result[pykey] = _py2js(pyvalue, references, version)
else:
assert isinstance(pykey, (str, int, tuple)) or pykey is None
assert not isinstance(pykey, list)
refid = len(references)
references.append(_py2js(pykey, references, version))
result[str(refid)] = _py2js(pyvalue, references, version)
return result
raise TypeError(f"Object of type {type(o)} is not allowed {o}")


def _js2py(o, references, version):
if isinstance(o, (str, int)) or o is None:
return o
elif isinstance(o, list):
if version == 1:
return list(_js2py(item, references, version) for item in o)
elif version == 2:
return tuple(_js2py(item, references, version) for item in o)
raise ValueError(f"Unexpected version {version}")
elif isinstance(o, dict):
result = {}
if "$" in o:
if o["$"] == "t":
assert version == 1
data = o["items"]
return tuple(_js2py(item, references, version) for item in data)
elif o["$"] == "l":
assert version == 2
data = o["items"]
return list(_js2py(item, references, version) for item in data)
raise TypeError(f'Unrecognized object of type: {o["$"]} {o}')
else:
for refid, jsvalue in o.items():
assert isinstance(refid, str)
if refid.isdigit():
refid = int(refid)
assert 0 <= refid < len(references)
jskey = references[refid]
pyvalue = _js2py(jsvalue, references, version)
pykey = _js2py(jskey, references, version)
result[pykey] = pyvalue
else:
result[refid] = _js2py(jsvalue, references, version)
return result
raise TypeError(f'Object of type "{type(o).__name__}" is not allowed {o}')
54 changes: 54 additions & 0 deletions ropetest/objectdbtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,57 @@ def test_using_file_list_observer(self, db):
db.add_file_list_observer(observer)
db.validate_files()
self.assertEqual("removed invalid ", observer.log)

@_do_for_all_dbs
def test_legacy_serialization(self, db):
    """A pre-recorded pickle of a ScopeInfo must still load with matching data."""
    import pickle

    db.add_callinfo("file", "key", (1, 2), 3)
    db.add_pername("file", "key", "name", 1)
    expected = db._get_scope_info("file", "key")

    # Byte string recorded ahead of time; it must stay exactly as-is.
    pickled_data = b'\x80\x04\x95D\x00\x00\x00\x00\x00\x00\x00\x8c\x15rope.base.oi.memorydb\x94\x8c\tScopeInfo\x94\x93\x94)\x81\x94}\x94K\x01K\x02\x86\x94K\x03s}\x94\x8c\x04name\x94K\x01s\x86\x94b.' # noqa

    call_info = pickle.loads(pickled_data).call_info
    per_name = pickle.loads(pickled_data).per_name
    assert call_info == expected.call_info
    assert per_name == expected.per_name

@_do_for_all_dbs
def test_new_pickle_serialization(self, db):
    """A ScopeInfo must survive a pickle dumps/loads round trip unchanged."""
    import pickle

    db.add_callinfo("file", "key", (1, 2), 3)
    db.add_pername("file", "key", "name", 1)
    original = db._get_scope_info("file", "key")

    restored = pickle.loads(pickle.dumps(original))

    assert restored.call_info == original.call_info
    assert restored.per_name == original.per_name

@_do_for_all_dbs
def test_new_json_serialization(self, db):
    """A ScopeInfo nested in a JSON document must round-trip via its state hooks."""
    import json

    from rope.base.oi.memorydb import ScopeInfo

    def revive_scope_info(o):
        # json.loads calls this for every decoded object; only objects
        # tagged as ScopeInfo are turned back into instances.
        if o.get("$") != "ScopeInfo":
            return o
        revived = ScopeInfo.__new__(ScopeInfo)
        revived.__setstate__(o)
        return revived

    db.add_callinfo("file", "key", (1, 2), 3)
    db.add_pername("file", "key", "name", 1)
    scope_info = db._get_scope_info("file", "key")

    data = {"inside": [scope_info], "other": scope_info, "things": [1, 2, 3]}

    serialized = json.dumps(data, default=lambda o: o.__getstate__())
    rehydrated_data = json.loads(serialized, object_hook=revive_scope_info)

    rehydrated_scope_info = rehydrated_data["inside"][0]
    assert isinstance(rehydrated_scope_info, ScopeInfo)
    assert rehydrated_scope_info.call_info == scope_info.call_info
    assert rehydrated_scope_info.per_name == scope_info.per_name