#!/usr/bin/env python3
"""
Working with data structures is sometimes hard, especially when you need to
find specific information in nested JSON and no schema is provided, or when
the data is changing fast.
Author: F. Rämisch <raemisch@ub.uni-leipzig.de>
Copyright: 2018, Universitätsbibliothek Leipzig
License: GNU General Public License v3
"""
from ..deepget import deepget
TAB_SYMBOL = " "


class TreeExplore:
    """
    TreeExplore provides easy to use methods to explore complex data
    structures obtained e.g. from online REST-APIs. As the data structures
    behind such APIs often grew over the years, the internal structure of the
    returned objects is often not logical.

    By providing a full text search and a show method, this tool can be
    helpful when first investigating what information is to be found in the
    data and what its structure is.

    :Example:

    >>> import diggrtoolbox as dt
    >>> test_dict = {'id' : 123456789,
    ...              'data' : {'name' : 'diggr project',
    ...                        'city' : 'Leipzig',
    ...                        'field': 'Video Game Culture'},
    ...              'references':[{'url' : 'http://diggr.link',
    ...                             'name' : 'diggr website'},
    ...                            {'url' : 'http://ub.uni-leipzig.de',
    ...                             'name' : 'UBL website'}]}
    >>> tree = dt.TreeExplore(test_dict)
    >>> results = tree.search("leipzig")
    Search-Term: leipzig
    Route: references, 1, url,
    Embedding: 'http://ub.uni-leipzig.de'
    >>> print(results)
    [{'term': 'leipzig',
      'route': ['references', 1, 'url'],
      'embedding': 'http://ub.uni-leipzig.de',
      'unique_in_embedding': False}]

    .. note:: The search is case sensitive. ``unique_in_embedding`` is only
       evaluated for embeddings longer than 50 characters; for shorter ones
       it is always ``False``.
    """

    def __init__(self, tree, tab_symbol=TAB_SYMBOL):
        """
        Initializes the TreeExplore object. Checks if the provided tree
        object is a (nested) dict/list, otherwise a TypeError is raised.

        :param tree: the object to be explored.
        :type tree: dict or list
        :param tab_symbol: symbol used to indent when displaying nested
                           structures
        :type tab_symbol: str
        :raises TypeError: if tree is neither a dict nor a list
        """
        if not isinstance(tree, (dict, list)):
            raise TypeError("This only makes sense for lists and dicts.")
        self.tree = tree
        self.tab_symbol = tab_symbol

    def __getitem__(self, route):
        """
        Provides access to the stored tree. By using the deepget function,
        objects can be accessed by giving the route, a TreeExplore result, or
        the key/index for any object in the tree.

        :param route: route, result or key/index
        :type route: list, dict, str, int, float
        """
        if isinstance(route, list):
            return deepget(self.tree, route)
        if isinstance(route, dict):
            # A search result dict carries its route under the 'route' key.
            return deepget(self.tree, route['route'])
        return self.tree[route]

    def _display(self, value, indent):
        """
        Displays a value at the given indentation level.

        :param value: the value to be displayed
        :type value: str, int, float
        :param indent: number of tab symbols printed in front of the value
        :type indent: int
        """
        # print() puts a single space between the indentation and the value.
        print(self.tab_symbol * indent, value)

    def show(self, tree=None, indent=0):
        """
        Visualizes the whole tree. If no tree-like structure (dict/list/both)
        is given, self.tree is used. This function is called recursively
        with the nested subtrees. For dicts every key is printed; for lists
        only their length is printed.

        :param tree: The tree to be shown.
        :type tree: dict, list
        :param indent: Current indentation level of this tree
        :type indent: int
        :raises TypeError: if tree is neither a dict nor a list
        """
        # Compare against None explicitly: an empty nested dict/list is falsy
        # and must not be replaced by the root tree, which would recurse
        # forever.
        if tree is None:
            tree = self.tree
        if isinstance(tree, dict):
            for key, value in tree.items():
                self._display(key, indent)
                if isinstance(value, (dict, list)):
                    self.show(value, indent + 1)
        elif isinstance(tree, list):
            self._display(len(tree), indent)
        else:
            raise TypeError("Object to must be an instance of dict or list")

    def show_search_result(self, result):
        """
        Displays a search result together with its embedding and path.

        :param result: the result dict generated by _prepare_search_result
        :type result: dict
        """
        print("Search-Term: {}".format(result['term']))
        print("Route: ", end="")
        for step in result['route']:
            print(step, end=", ")
        print()
        if 'embedding' in result:
            print("Embedding: '{}' ".format(result['embedding']), end="")
            if result['unique_in_embedding']:
                print('(unique)')
            else:
                print()

    def _prepare_search_result(self,
                               term,
                               route,
                               results,
                               embedding=None,
                               show_result=True):
        """
        Prepares a search result, appends it to the results list and
        organizes the printing.

        Embeddings of up to 50 characters are stored unchanged. Longer ones
        are cut down to the span from the first to the last occurrence of the
        term plus up to 10 characters of context on either side. If the term
        does not occur in a long embedding at all (e.g. only the dict key
        matched), the first 50 characters are stored instead.

        :param term: the term that was searched for
        :type term: str, int, float
        :param route: the route to the matching entry
        :type route: list
        :param results: the list the new result is appended to
        :type results: list
        :param embedding: the text the term was found in, default: None
        :type embedding: str
        :param show_result: Should the result be displayed, default: True
        :type show_result: bool
        :returns: the results list that was passed in
        :rtype: list
        """
        result = {'term': term,
                  'route': route}
        # Only non-empty strings can be excerpted; anything else is ignored.
        if embedding and isinstance(embedding, str):
            unique_in_embedding = False
            excerpt = embedding
            if len(embedding) > 50:
                first = embedding.find(term) if isinstance(term, str) else -1
                if first < 0:
                    excerpt = embedding[:50]
                else:
                    last = embedding.rfind(term)
                    unique_in_embedding = first == last
                    start = max(first - 10, 0)
                    # rfind yields the START of the last occurrence, so the
                    # term length has to be added before the context.
                    stop = min(last + len(term) + 10, len(embedding))
                    excerpt = embedding[start:stop]
            result['embedding'] = excerpt
            result['unique_in_embedding'] = unique_in_embedding
        results.append(result)
        if show_result:
            self.show_search_result(result)
        return results

    def _search(self,
                term,
                tree=None,
                route=None,
                results=None,
                show_results=True):
        """
        This function provides full text search for nested dicts/lists/both.
        It will return the path to every occurrence of the term. String terms
        are matched as case sensitive substrings of keys and string values;
        numeric terms are matched by equality against keys, numeric dict
        values and numeric list elements. Each dict entry or list element
        yields at most one result.

        :param term: the term/object to be found in the tree.
        :type term: str, int, float
        :param tree: The (sub)tree to be searched, default: None/self.tree
        :type tree: (nested) dict/list
        :param route: The route taken to get to this subtree, default: None
        :type route: list
        :param results: The result object used to store all occurrences.
        :type results: list
        :param show_results: Should the results be displayed, default: True
        :type show_results: bool
        :returns: list of result dicts
        :rtype: list
        :raises TypeError: if the tree is no dict/list or an unsupported
                           term/element type is encountered
        """
        if tree is None:
            tree = self.tree
        if results is None:
            results = []
        # Work on a private copy so the caller's route is never modified.
        route = [] if route is None else list(route)

        if isinstance(tree, dict):
            for key, value in tree.items():
                here = route + [key]
                # Nested containers are searched first; results is shared and
                # filled in place by the recursive call.
                if isinstance(value, (dict, list)):
                    self._search(term, value, here, results, show_results)
                if isinstance(term, (int, float)):
                    value_hit = (isinstance(value, (int, float))
                                 and term == value)
                    if term == key or value_hit:
                        self._prepare_search_result(term,
                                                    here,
                                                    results,
                                                    None,
                                                    show_results)
                elif isinstance(term, str):
                    if any(isinstance(candidate, str) and term in candidate
                           for candidate in (key, value)):
                        # Only string values can serve as an embedding.
                        embedding = value if isinstance(value, str) else None
                        self._prepare_search_result(term,
                                                    here,
                                                    results,
                                                    embedding,
                                                    show_results)
                else:
                    raise TypeError(
                        "Encountered unsupported type at {}".format(route))
        elif isinstance(tree, list):
            for index, element in enumerate(tree):
                here = route + [index]
                if isinstance(element, (dict, list)):
                    self._search(term, element, here, results, show_results)
                elif isinstance(element, (int, float)):
                    if term == element:
                        self._prepare_search_result(term,
                                                    here,
                                                    results,
                                                    None,
                                                    show_results)
                elif isinstance(term, str) and isinstance(element, str):
                    if term in element:
                        self._prepare_search_result(term,
                                                    here,
                                                    results,
                                                    element,
                                                    show_results)
                elif isinstance(term, (int, float)) or element is None:
                    # A numeric term cannot match a string element, and None
                    # entries carry nothing to search.
                    continue
                else:
                    raise TypeError(
                        "Encountered unsupported type at {}".format(route))
        else:
            raise TypeError("Not a valid tree to search in.")
        return results

    def search(self, term):
        """
        Wrapper for the _search function, stripping all the parameters not
        to be used by the end user. Every result is printed.

        :param term: the term/object to be found in the tree.
        :type term: str, int, float
        :returns: list of result dicts
        :rtype: list
        """
        return self._search(term)

    def quiet_search(self, term):
        """
        Wrapper for the _search function to ease access to a nonprinting
        search function.

        :param term: the term/object to be found in the tree.
        :type term: str, int, float
        :returns: list of result dicts
        :rtype: list
        """
        return self._search(term,
                            tree=None,
                            route=None,
                            results=None,
                            show_results=False)

    def find(self, term):
        """
        Placeholder without an implementation; currently returns None.
        """
        pass

    def find_key(self, key):
        """
        Placeholder without an implementation; currently returns None.
        """
        pass

    def find_value(self, value):
        """
        Placeholder without an implementation; currently returns None.
        """
        pass