Source code for diggrtoolbox.treeexplore.treeexplore

#!/usr/bin/env python3
"""
Getting to know the data structures you work with is sometimes hard,
especially when you need to find specific information in nested JSON and no
schema is provided, or the data and its structure are changing fast.

Author: F. Rämisch <raemisch@ub.uni-leipzig.de>
Copyright: 2018, Universitätsbibliothek Leipzig
License: GNU General Public License v3
"""

from ..deepget import deepget


# Default indentation unit used by TreeExplore when printing nested levels.
TAB_SYMBOL = "  "


class TreeExplore:
    """
    TreeExplore provides easy to use methods to explore complex data
    structures obtained e.g. from online REST-APIs. As data structures behind
    often grew over the years, the internal structure of these objects to be
    obtained often is not logical.

    By providing a full text search and a show method, this tool can be
    helpful when first investigating, what information is to be found in the
    data and what is its structure.

    :Example:

    >>> import diggrtoolbox as dt
    >>> test_dict = {'id' : 123456789,
    ...              'data' : {'name' : 'diggr project',
    ...                        'city' : 'Leipzig',
    ...                        'field': 'Video Game Culture'},
    ...              'references':[{'url' : 'http://diggr.link',
    ...                             'name' : 'diggr website'},
    ...                            {'url' : 'http://ub.uni-leipzig.de',
    ...                             'name' : 'UBL website'}]}
    >>> tree = dt.TreeExplore(test_dict)
    >>> results = tree.search("leipzig")
    Search-Term: leipzig
    Route: references, 1, url,
    Embedding: 'http://ub.uni-leipzig.de' (unique)
    >>> print(results)
    [{'embedding': 'http://ub.uni-leipzig.de',
      'route': ['references', 1, 'url'],
      'unique_in_embedding': True,
      'term': 'leipzig'}]

    .. note::

       Currently the search is case sensitive only!
    """

    def __init__(self, tree, tab_symbol=TAB_SYMBOL):
        """
        Initializes the TreeExplore object. Checks if the provided tree
        object is either a (nested) dict or list, otherwise a TypeError is
        raised.

        :param tree: the object to be explored.
        :type tree: dict or list
        :param tab_symbol: symbol used to indent when displaying nested
                           structures
        :type tab_symbol: str
        :raises TypeError: if tree is neither a dict nor a list
        """
        if not isinstance(tree, (dict, list)):
            raise TypeError("This only makes sense for lists and dicts.")
        self.tree = tree
        self.tab_symbol = tab_symbol

    def __getitem__(self, route):
        """
        Provides access to the stored tree.

        A list is handed to deepget as a route, a dict is treated as a search
        result whose 'route' entry is handed to deepget, and anything else is
        used directly as key/index on the top level of the tree.

        :param route: route, result or key/index
        :type route: list, dict, str, int, float
        """
        if isinstance(route, list):
            return deepget(self.tree, route)
        if isinstance(route, dict):
            return deepget(self.tree, route['route'])
        return self.tree[route]

    def _display(self, value, indent):
        """
        Displays a value at the given indentation level.

        :param value: the value to be displayed
        :type value: str, int, float
        :param indent: number of tab symbols printed in front of the value
        :type indent: int
        """
        # print() puts one separating space between the indentation prefix
        # and the value, also on level 0.
        print(self.tab_symbol * indent, value)

    def show(self, tree=None, indent=0):
        """
        Visualizes the whole tree. If no tree-like structure (dict/list/both)
        is given, the self.tree is used. This function is called recursively
        with the nested subtrees. For dicts every key is printed, for lists
        only their length.

        :param tree: The tree to be shown.
        :type tree: dict, list
        :param indent: Current indentation level of this tree
        :type indent: int
        :raises TypeError: if tree is neither a dict nor a list
        """
        # Compare against None explicitly: an empty nested dict/list is falsy
        # and would otherwise restart at the root and recurse forever.
        if tree is None:
            tree = self.tree
        if isinstance(tree, dict):
            for key, value in tree.items():
                self._display(key, indent)
                if isinstance(value, (dict, list)):
                    self.show(value, indent + 1)
        elif isinstance(tree, list):
            self._display(len(tree), indent)
        else:
            raise TypeError(
                "Object to show must be an instance of dict or list")

    def show_search_result(self, result):
        """
        Displays a search result together with its embedding and path.

        :param result: the result dict generated by _prepare_search_result
        :type result: dict
        """
        print("Search-Term: {}".format(result['term']))
        print("Route: ", end="")
        for step in result['route']:
            print(step, end=", ")
        print()
        if 'embedding' in result:
            print("Embedding: '{}' ".format(result['embedding']), end="")
            if result['unique_in_embedding']:
                print('(unique)')
            else:
                print()

    def _prepare_search_result(self, term, route, results, embedding=None,
                               show_result=True):
        """
        Builds a result dict, appends it to the results list and optionally
        prints it.

        The result always contains 'term' and 'route'. If a non-empty string
        embedding is given, 'embedding' and 'unique_in_embedding' are added.
        Embeddings longer than 50 characters are cut down to a window reaching
        from 10 characters before the first occurrence of the term to 10
        characters after the last one.

        :param term: the term which was found
        :type term: str, int, float
        :param route: the route leading to the match
        :type route: list
        :param results: list the new result is appended to
        :type results: list
        :param embedding: the string surrounding the match, default: None
        :type embedding: str
        :param show_result: Should the result be displayed, default: True
        :type show_result: bool
        :return: the results list including the new result
        :rtype: list
        """
        result = {'term': term, 'route': route}
        # Only strings can be searched and sliced; other values (numbers,
        # containers) carry no usable text embedding.
        if isinstance(embedding, str) and embedding:
            first = last = -1
            if isinstance(term, str):
                first = embedding.find(term)
                last = embedding.rfind(term)
            found = first != -1
            start, stop = 0, len(embedding)
            if found and len(embedding) > 50:
                start = max(first - 10, 0)
                # rfind yields the start of the last hit, so the term length
                # has to be added before the trailing context.
                stop = min(last + len(term) + 10, len(embedding))
            result['embedding'] = embedding[start:stop]
            result['unique_in_embedding'] = found and first == last
        results.append(result)
        if show_result:
            self.show_search_result(result)
        return results

    def _search(self, term, tree=None, route=None, results=None,
                show_results=True):
        """
        This function provides full text search for nested dicts/lists/both.
        It will return the path to every occurrence of the term.

        String terms are matched as substrings of string keys, string values
        and string list elements. Numeric terms are matched by equality
        against numeric keys, values and list elements.

        :param term: the term/object to be found in the tree.
        :type term: str, int, float
        :param tree: The (sub)tree to be searched, default: None/self.tree
        :type tree: (nested) dict/list
        :param route: The route taken to get to this subtree, default: None
        :type route: list
        :param results: The result object used to store all occurrences.
        :type results: list
        :param show_results: Should the results be displayed, default: True
        :type show_results: bool
        :return: list of result dicts
        :rtype: list
        :raises TypeError: if the term or an element has an unsupported type
                           or the tree is neither dict nor list
        """
        if tree is None:
            tree = self.tree
        if results is None:
            results = []
        if route is None:
            route = []

        if isinstance(tree, dict):
            for key, value in tree.items():
                here = route + [key]
                if isinstance(value, (dict, list)):
                    # Nested hits are appended to the shared results list.
                    self._search(term, value, here, results, show_results)
                if isinstance(term, (int, float)):
                    for candidate in (key, value):
                        if (isinstance(candidate, (int, float))
                                and candidate == term):
                            self._prepare_search_result(
                                term, here, results, None, show_results)
                elif isinstance(term, str):
                    # A matching key may belong to a non-string value; only
                    # strings are usable as embedding.
                    embedding = value if isinstance(value, str) else None
                    for candidate in (key, value):
                        if isinstance(candidate, str) and term in candidate:
                            self._prepare_search_result(
                                term, here, results, embedding, show_results)
                else:
                    raise TypeError(
                        "Encountered unsupported type at {}".format(route))
        elif isinstance(tree, list):
            for index, element in enumerate(tree):
                here = route + [index]
                if isinstance(element, (dict, list)):
                    self._search(term, element, here, results, show_results)
                elif isinstance(element, (int, float)):
                    if term == element:
                        self._prepare_search_result(
                            term, here, results, None, show_results)
                elif isinstance(term, str) and isinstance(element, str):
                    if term in element:
                        self._prepare_search_result(
                            term, here, results, element, show_results)
                elif isinstance(term, (int, float)) or element is None:
                    # Numeric terms cannot match non-numeric elements and
                    # None never matches anything.
                    continue
                else:
                    raise TypeError(
                        "Encountered unsupported type at {}".format(route))
        else:
            raise TypeError("Not a valid tree to search in.")
        return results

    def search(self, term):
        """
        Wrapper for the _search function, stripping all the parameters not to
        be used by the end user.

        :param term: the term/object to be found in the tree.
        :type term: str, int, float
        :return: list of result dicts
        :rtype: list
        """
        return self._search(term)

    def find(self, term):
        """
        Placeholder, not implemented yet. Does nothing and returns None.
        """
        pass

    def find_key(self, key):
        """
        Placeholder, not implemented yet. Does nothing and returns None.
        """
        pass

    def find_value(self, value):
        """
        Placeholder, not implemented yet. Does nothing and returns None.
        """
        pass