commit 93a3f9e6fe156e36543a5ec09f9a2856c33dd738 Author: Felix Förtsch Date: Wed Feb 18 10:50:24 2026 +0100 snapshot current state before gitea sync diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..59b4466 Binary files /dev/null and b/.DS_Store differ diff --git a/DataVaultGenerator.egg-info/PKG-INFO b/DataVaultGenerator.egg-info/PKG-INFO new file mode 100644 index 0000000..137e6a2 --- /dev/null +++ b/DataVaultGenerator.egg-info/PKG-INFO @@ -0,0 +1,15 @@ +Metadata-Version: 2.1 +Name: DataVaultGenerator +Version: 1.1.5 +Summary: BI Data Vault Generator package +Home-page: https://github.com/... +Author: Christoph Metz +Author-email: metz@bi-web.de +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: BI License +Classifier: Operating System :: OS Independent +Requires-Python: >=3.7 +Description-Content-Type: text/markdown diff --git a/DataVaultGenerator.egg-info/SOURCES.txt b/DataVaultGenerator.egg-info/SOURCES.txt new file mode 100644 index 0000000..a037878 --- /dev/null +++ b/DataVaultGenerator.egg-info/SOURCES.txt @@ -0,0 +1,54 @@ +README.md +setup.py +DataVaultGenerator/Components.py +DataVaultGenerator/Config.py +DataVaultGenerator/Dag.py +DataVaultGenerator/Mapping.py +DataVaultGenerator/Model.py +DataVaultGenerator/__init__.py +DataVaultGenerator/__main__.py +DataVaultGenerator.egg-info/PKG-INFO +DataVaultGenerator.egg-info/SOURCES.txt +DataVaultGenerator.egg-info/dependency_links.txt +DataVaultGenerator.egg-info/entry_points.txt +DataVaultGenerator.egg-info/requires.txt +DataVaultGenerator.egg-info/top_level.txt +DataVaultGenerator/Entities/Bridge.py +DataVaultGenerator/Entities/Composite.py +DataVaultGenerator/Entities/Delivery.py +DataVaultGenerator/Entities/GenericTable.py +DataVaultGenerator/Entities/GenericTask.py +DataVaultGenerator/Entities/GenericTransformation.py +DataVaultGenerator/Entities/Hub.py 
+DataVaultGenerator/Entities/Interface.py +DataVaultGenerator/Entities/Link.py +DataVaultGenerator/Entities/PIT.py +DataVaultGenerator/Entities/Reference.py +DataVaultGenerator/Entities/Report.py +DataVaultGenerator/Entities/Satellite.py +DataVaultGenerator/Entities/Sourcesystem.py +DataVaultGenerator/Entities/SubDag.py +DataVaultGenerator/Entities/View.py +DataVaultGenerator/schema/config.yaml +DataVaultGenerator/schema/mapping.yaml +DataVaultGenerator/schema/model.yaml +DataVaultGenerator/schema/sys_specification.yaml +DataVaultGenerator/schema/entities/bridge.yaml +DataVaultGenerator/schema/entities/composite.yaml +DataVaultGenerator/schema/entities/delivery.yaml +DataVaultGenerator/schema/entities/generictable.yaml +DataVaultGenerator/schema/entities/generictask.yaml +DataVaultGenerator/schema/entities/generictransformation.yaml +DataVaultGenerator/schema/entities/hub.yaml +DataVaultGenerator/schema/entities/link.yaml +DataVaultGenerator/schema/entities/pit.yaml +DataVaultGenerator/schema/entities/reference.yaml +DataVaultGenerator/schema/entities/report.yaml +DataVaultGenerator/schema/entities/satellite.yaml +DataVaultGenerator/schema/entities/source.yaml +DataVaultGenerator/schema/entities/sourcesystem.yaml +DataVaultGenerator/schema/entities/subdag.yaml +DataVaultGenerator/schema/entities/view.yaml +DataVaultGenerator/schema/registry/attribute.yaml +DataVaultGenerator/schema/registry/attributes.yaml +DataVaultGenerator/schema/registry/dbentity.yaml \ No newline at end of file diff --git a/DataVaultGenerator.egg-info/dependency_links.txt b/DataVaultGenerator.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/DataVaultGenerator.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/DataVaultGenerator.egg-info/entry_points.txt b/DataVaultGenerator.egg-info/entry_points.txt new file mode 100644 index 0000000..73d3109 --- /dev/null +++ b/DataVaultGenerator.egg-info/entry_points.txt @@ -0,0 +1,3 @@ +[console_scripts] 
def add_to_log_tree(tree: Tree, path: tuple, currentindex: int) -> Tree:
    """Recursively append the remaining *path* elements as nested tree nodes.

    Returns the deepest node so the caller can attach the message to it.
    """
    if currentindex < len(path):
        child = tree.add(str(path[currentindex]))
        return add_to_log_tree(child, path, currentindex + 1)
    return tree


def log(level: int, title: str, path: tuple, message: str, printout: bool = False):
    """Render a log entry as a rich tree panel and record it via ``logging``.

    :param level: logging level (e.g. ``logging.ERROR``) used for the
        standard-library log record.
    :param title: panel title, always rendered in red.
    :param path: hierarchy of locations (file, entity, attribute, ...),
        displayed as a nested tree; must contain at least one element.
    :param message: message attached to the deepest tree node.
    :param printout: kept for backward compatibility; the panel is always
        printed (matching the previous behaviour).
    """
    # BUGFIX: `level` was accepted but never used — the entry is now also
    # recorded through the standard logging machinery, as the earlier
    # (commented-out) implementation of this function did.
    logging.log(level, '%s: %s > %s', title, ' > '.join(str(p) for p in path), message)

    tree = Tree(str(path[0]), highlight=True)
    last = add_to_log_tree(tree, path, 1)
    last.add(message)
    print(Panel(tree, title="[red]" + str(title), padding=1, title_align="left"))
class ErrorCollection:
    """Accumulates validation errors as plain dicts (title / path / message)."""

    def __init__(self):
        # Each entry has the shape {'title': ..., 'path': ..., 'message': ...}.
        self._errors = []

    def add(self, title, path, message):
        """Record a single error."""
        self._errors.append({'title': title, 'path': path, 'message': message})

    def append(self, errors):
        """Merge every error from another ErrorCollection into this one."""
        self._errors.extend(errors._errors)

    @property
    def count(self) -> int:
        """Number of collected errors."""
        return len(self._errors)

    @property
    def errors(self) -> list:
        """The underlying error list (not a copy)."""
        return self._errors
die Definition der Attribute umstellen von Liste auf dict: + + # attributes: + # - {name: cust_no, type: 'varchar(32)'} + # attributes: + # cust_no: {type: 'varchar(32)'} + # + # Pattern zum Ersetzen in VSCODE: suche: - \{name: (.*), Ersetze durch: $1: { + + + + # __slots__ = ('_definition' + # ,'entity' + # ,'name' + # ,'datatype' + # ,'is_mandatory' + # ,'logicalname' + # ,'description' + # ,'role' + # ,'_ghostvalue' ) + def __init__(self, entity, definition): + """ + + :rtype: object + """ + + #self.id = uuid.uuid4().hex + self._definition = definition + self.entity = entity + self.name = definition.get('name', '') + + #self.datatype = definition.get('type', '') # Old: type="varchar(100)" + + self._type = definition.get('type', '') # New: type='varchar' + self.length = definition.get('length', '') + self.precision = definition.get('precision', '') + self.scale = definition.get('scale', '') + self.default = definition.get('default', '') + + #TODO: self.datatype als property => varchar(100) + # self.type als native type + + self.is_mandatory = definition.get('mandatory', False) + + self.logicalname = definition.get('logicalname', '') #fixme: in schema aufnehmnen + self.description = definition.get('description', '') + + self.role = definition.get('role', + self.entity.model.config.entitydefaults[self.entity.type].get('attribute_role','base')) + + + + self._ghostvalue = definition.get('ghost') + + self.properties = definition.get('props', {}) + + self.order = definition.get('order') + self.is_pii = definition.get('pii', False) + + + + @property + def datatype(self) -> str: # => Full datatype + return self._type + + @property + def ghostvalue(self) -> str: + if not self._ghostvalue: + return self.entity.model.config.datavault.ghostrecord.get(self.native_datatype.lower(), + self.entity.model.config.datavault.ghostrecord.get('other', '')) + else: + return self._ghostvalue + + @property + def native_datatype(self) -> str: + """Returns the native datatype expression. 
E.g. nvarchar""" + return self.datatype[:self.datatype.find('(')].strip().lower() if self.datatype.find( + '(') != -1 else self.datatype + # IMPROVE: Müsste eigentlich in der Attributdefinition explizit drin stehen. + + @property + def native_datatypelength(self) -> str: + """Returns the native datatype length. E.g. nvarchar(100) -> 100 """ + return self.datatype[self.datatype.find('(') + 1:self.datatype.find(')')].strip().lower() if self.datatype.find('(') != -1 else '' + + # IMPROVE: Müsste eigentlich in der Attributdefinition explizit drin stehen. + + @property + def column_definition(self) -> str: + """Returns the columndefinition, based on the configured template.""" + return self.entity.model.basetemplates.get('column_ddl').render(attribute=self) + + def copy(self, newname: str = ''): + if newname: + copy = DataVaultEntityAttribute(self.entity, self._definition) + copy.name = newname + return copy + else: + return DataVaultEntityAttribute(self.entity, self._definition) + + def validate(self, spec): + errors = ErrorCollection() + if not spec: + return errors + + logging.debug('Validating attribute <%s>',self.name) + + is_valid = False + for datatype, definition in self.entity.model.sys_specifications[spec]['datatypes'].items(): + matches = re.findall(definition.get('pattern'), self._type, re.MULTILINE | re.IGNORECASE) + for m in matches: + is_valid = True + + if not is_valid: + logging.debug('datatype <%s> of attribute <%s> not valid',self._type, self.name) + + errors.add("VALIDATION ERROR", + (self.entity.filename,"Attribute", "<" + self.name + ">"), + f'Datatype <{self._type}> not valid (not matching any pattern in {spec})') + + return errors + + +class DerivedAttribute(DataVaultEntityAttribute): + pass + + +class GeneratorEntity: + def __init__(self, model, filename: str, definition: dict = None): + # logging.info('Creating Entity %s',definition['name']) + self.model = model + self.filename = filename + self._definition = definition + self.id = 
    def render_template(self, templatefilename: str):
        """Render this entity with the given Jinja template and return the result as a string.

        Rendering errors are treated as fatal: the process exits with code 2
        if the template is missing or references an undefined variable.
        """

        try:
            template = self.model.templateEnvironment.get_template(templatefilename)
            #print( self.model.templateEnvironment.loader.get_source( self.model.templateEnvironment, templatefilename))

            #checksum = hashlib.md5(str(template).encode()).hexdigest().upper()
            # The template receives the entity itself plus metadata about the
            # template being rendered (templateversion is currently unused).
            output = template.render(
                entity=self,
                templatename=templatefilename,
                templateversion=''
            )

        except TemplateNotFound:
            # Missing template: show a rich error panel, log, and abort.
            print("")
            print(Panel(f"[red]Error while rendering entity-templates[/red]: Template {templatefilename} not found.", title="[red]RENDER ERROR", padding=1,title_align="left" ))
            logging.error(f"Template {templatefilename} not found.")
            #print(f"Template {templatefilename} not found.")

            sys.exit(2)
        except UndefinedError as e:
            # Undefined variables inside the template surface here.
            print("")
            logging.error(f"Error while rendering entity {self.name} :")
            logging.error(e)
            print(f"Error while rendering entity {self.name} :", e)
            sys.exit(2)
        return output
class Layer(GeneratorEntity):
    """Represents one configured model layer (as defined under config['layer'])."""

    def __init__(self, model, layerid, definition):
        GeneratorEntity.__init__(self, model, '', definition)
        # The layer id is the key used in config['layer'], not an entity name.
        self.id = layerid

    def _layer_config(self):
        """This layer's configuration dict (hoists the repeated lookup)."""
        return self.model.config.layer.get(self.id)

    @property
    def database(self) -> str:
        return self._layer_config().get('defaultdatabaseobject').get('database')

    @property
    def schema(self) -> str:
        return self._layer_config().get('defaultdatabaseobject').get('schema')

    @property
    def filegroup(self) -> str:
        return self._layer_config().get('defaultdatabaseobject').get('filegroup')

    @property
    def sys_specification(self) -> str:
        return self._layer_config().get('sys_specification', '')

    @property
    def connection_name(self) -> str:
        return self._layer_config().get('connectionname', '')

    def get_entities(self) -> dict:
        """All model entities assigned to this layer, keyed by entity name.

        BUGFIX: the original annotation claimed ``-> list`` although a dict
        comprehension is (and was) returned.
        """
        return {k: v for k, v in self.model.entities.items() if v.layer == self}

    def get_entity_count(self) -> int:
        """Number of model entities assigned to this layer."""
        return sum(1 for e in self.model.entities.values() if e.layer == self)
    def get_attributes(self, roles: list = 'all', exclude: list = ()) -> list[DataVaultEntityAttribute]:
        """Return this entity's attributes filtered by role.

        ``roles`` is either a list of role names or one of the marker strings
        ``'all'`` / ``'fk'``. NOTE(review): when a plain string is passed, the
        ``in`` checks below use substring semantics (e.g. ``'all' in 'allx'``
        is True) — presumably callers only pass exact role names; confirm.
        ``exclude`` drops attributes with the listed roles; it is not applied
        in the ``'fk'`` branch.
        """
        if 'all' in roles:
            # All own attributes (minus excluded roles) plus attributes that
            # live on connected entities (resolved by subclasses).
            return [a for a in self.attributes.values() if a.role not in exclude] + self.get_foreign_attributes()
        elif 'fk' in roles:  # IMPROVE: could be simplified if attribute references were already resolved.
            return [a for a in self.attributes.values() if a.role in roles] + self.get_foreign_attributes()
        else:
            return [a for a in self.attributes.values() if a.role in roles and a.role not in exclude]
+ return [] + + def get_role_attribute(self, role: str) -> DataVaultEntityAttribute: + # returns a specific role-attribute (recordsource, loaddate, hashdiff) as configured in config + try: + return self.get_attributes(roles=role)[0] + except IndexError: + return None + + def contains_pii_attributes(self) -> Boolean: + return any([True for a in self.attributes.values() if a.is_pii ]) + + def get_source_entities(self, active_only: Boolean = True): + """returns list of source entities by lookup of the target in the mapping-definition""" + # Hier dürften nur Entities vom type = delivery auftauchen + entities = dict() + + for sourcename, mapping in self.model.get_mappings().items(): + tm = mapping.get_targetmapping_by_target(self.name) # FIXME: hier ggf. auch role-hubs einbeziehen: + if tm: + if active_only and tm.type != 'mappingonly': + entities[sourcename] = self.model.get_entity(sourcename) + elif not active_only: + entities[sourcename] = self.model.get_entity(sourcename) + + return entities + + def validate(self) -> ErrorCollection: + return ErrorCollection() + + def get_component_attributes(self, attributename: str) -> list: + components = [] + for sourcename, mapping in self.model.get_mappings().items(): + tm = mapping.get_targetmapping_by_target(self.name) + if tm: + for am in tm.get_attribute_mappings(): + if am.targetattribute_name == attributename: + srcentity = self.model.get_entity(sourcename) + for ea in am.source.get_expression_attributes().values(): + components.append(dict(attribute=self.get_attribute(attributename), + sourceentity=srcentity, + sourceattribute=ea + )) + + return components + + + +class DataVaultEntityAttributeExpression: #FIXME:wird das so noch benötigt? 
vgl Mapping > AttributeMappingExpression + # VALUE_IND: + # expression: + # "max(case when {1} = 'IND' then {2} end)" + # components: + # 1: [thp, shortname] + # 2: [tls, value] + + def __init__(self, rule, expression: str = '', components: dict = {}, resulttype: str = ''): + """ + expression => Expression, e.g.: "max(case when {placeholder1} = 'IND' then {placeholder2} end)" + components => dict of list, e.g.: {placeholder1: [tablealias, column], + placeholder2: [tablealias, column]} + """ + + self._expression = expression + self.components = components + self._resulttype = resulttype + self._rule = rule + + @property + def expression(self) -> str: + return self._expression + + def get_components(self) -> dict: + return self.components + + @property + def datatype(self) -> str: + return self._resulttype + + def get_parsed_expression(self) -> str: + parsed_result = self.expression + template = self._rule.entity.model.basetemplates.get('attribute_expression') + + for placeholder, component in self.get_components().items(): + parsed_result = parsed_result.replace('{' + str(placeholder) + '}', template.render(component=component)) + + return parsed_result + + +class DataVaultEntityRule: + def __init__(self, entity, name: str, definition: dict = None): + self.entity = entity + self.name = name + self.expression = DataVaultEntityAttributeExpression(self, + definition.get('expression'), + definition.get('attributes'), + definition.get('resulttype') + ) + + +class MappingSource: + def __init__(self, model, entity): + self.model = model + self.entity = entity + self.name = entity.name + + def get_target_entity_names(self, active_only: Boolean = False) -> list: + entities = [] + if self.name in self.model.mappings: + for m in self.model.mappings.get(self.name).get_targetmappings().values(): + if active_only and m.type != 'mappingonly': + entities.append(m.targetentityname) + else: + entities.append(m.targetentityname) + + return entities + + def get_target_entities(self) -> 
list: + """returns list of direct mapped entites""" + if self.name in self.model.mappings: + return self.model.mappings.get(self.name).get_target_entities() + return [] + + def get_target_entity_hash_components(self, target) -> list: + """Get the components for a target-entities hashkey. Since there a different naming for each source, use the attribute names of the source""" + + hashcomponents = [] + + attributemappings = self.get_attribute_mappings_by_target(target.name) # stg -> einzelne entity + + for am in attributemappings: + hashcomponents.append({'sourceexpression': am.source, + 'targetattribute': am.target + }) + + # FIXME: bkcc-attribute sollte immer am Anfang stehen. + + return hashcomponents + + def get_target_entities_hash_components(self) -> dict: + + hash_keys = {} + targets = self.get_target_entities() # Direct mapped Targets + + for target in targets: + + if target.type in ["hub", "reference"]: #TODO: reference ist hier eine besonderheit, da der Key zwar nicht gehashed wird, aber genau dadurch "anfällig" für umbennenung von Quelle zu ziel ist. + hash_keys[target.key_attribute.name] = dict(hashattribute=target.key_attribute, + components=self.get_target_entity_hash_components(target), + targetentity=target) + + if target.type in ["link"]: + # Für den Link: attribute aus mapping + linked entity attribute aus mapping + linkhashcomponents = [] + for le in target.get_linked_entities(): + linkhashcomponents.extend(self.get_target_entity_hash_components(le)) + + linkhashcomponents.extend(self.get_target_entity_hash_components(target)) + + hash_keys[target.key_attribute.name] = dict(hashattribute=target.key_attribute, + components=linkhashcomponents, + targetentity=target) + + if target.type in ["satellite"]: + hash_keys[target.hashdiff_fk_attribute.name] = dict(hashattribute=target.hashdiff_fk_attribute, + components=self.get_target_entity_hash_components( + target), + targetentity=target) + # Wenn der referenzierte Hub bzw. 
class DynamicProperties(object):
    """Free-form property bag: attributes that were never set read as ``None``."""

    @classmethod
    def from_kwargs(cls, **kwargs):
        """Build an instance whose attributes mirror the given keyword args."""
        instance = cls()
        instance.__dict__.update(kwargs)
        return instance

    def __getattr__(self, attr):
        # Only invoked for *missing* attributes: treat them as unset -> None.
        return None
class ConfigDict(dict):
    """Dict with attribute-style access: ``cfg.key`` <=> ``cfg['key']``.

    Attribute writes are stored as dict items, so instances stay fully
    dict-compatible (iteration, ``update``, ``in``, ...).
    """

    def __init__(self, initialvalues: dict = None):
        # BUGFIX: the original default ``{}`` was a mutable default argument
        # shared between all calls.
        self.update(initialvalues or {})

    def __getattr__(self, item):
        # Only invoked for names not found via normal attribute lookup;
        # map them to dict keys.
        try:
            return self[item]
        except KeyError:
            # BUGFIX: raise AttributeError (not KeyError) for missing keys so
            # that hasattr(), getattr(..., default) and protocols such as
            # copy/pickle behave correctly on attribute access.
            raise AttributeError(item) from None

    def __setattr__(self, item, value):
        super().__setitem__(item, value)

    def __dir__(self):
        # Expose the keys for interactive tab completion.
        return super().__dir__() + [str(k) for k in self.keys()]
class DagNode:
    """A node in the load DAG, wrapping a generator entity."""

    def __init__(self, name, entity):
        self.name = name
        self.entity = entity
        # Topological depth assigned by the tree walks; 0 = not yet placed.
        self.level = 0
        self._visitedby = list()

    def __repr__(self):
        return "<" + self.name + ">"

# TODO: weight the edges: a satellite references a hub, for example, but does
# not strictly depend on the hub being loaded first. A weight from 0 (loose)
# to 1 (strict) per edge could then drive the level assignment:
#   stage -> sat : strict
#   hub   -> sat : loose
#   hub, sat -> pit : strict

class Dag:
    """Directed acyclic graph of load dependencies between model entities."""

    def __init__(self, model):
        self.model = model
        self.nodes = dict()   # node name -> DagNode
        self.edges = list()   # (predecessor name, successor name) tuples

    def reset(self):
        """Clear every node's computed level."""
        for node in self.nodes.values():
            node.level = 0

    def add_node(self, node: DagNode):
        self.nodes[node.name] = node

    def add_edge(self, edge: tuple):
        self.edges.append(edge)

    def get_node(self, name):
        return self.nodes.get(name)

    def get_successors(self, nodename):
        """Nodes reachable from *nodename* via a single outgoing edge."""
        return [self.nodes.get(edge[1]) for edge in self.edges if edge[0] == nodename]

    def get_predecessor(self, nodename):
        """Nodes with an edge pointing at *nodename*."""
        return [self.nodes.get(edge[0]) for edge in self.edges if edge[1] == nodename]

    def get_roots(self):
        """Nodes without any predecessor."""
        return [node for node in self.nodes.values() if not self.get_predecessor(node.name)]

    def get_forward_tree(self, node: DagNode, excludes=None, level=1, result=None):
        """Depth-first walk along successors, assigning increasing levels."""
        result = [] if result is None else result
        excludes = [] if excludes is None else excludes

        if node.name not in excludes:
            node.level = level

            if node not in result:
                result.append(node)

            for successor in self.get_successors(node.name):
                # Push each successor at least one level below the current node.
                if successor.level <= level:
                    successor.level = level + 1
                result = self.get_forward_tree(successor, excludes, successor.level, result)

        return result

    def get_backward_tree(self, node: DagNode, excludes=None, level=0, result=None):
        """Depth-first walk along predecessors, assigning decreasing levels."""
        result = [] if result is None else result
        excludes = [] if excludes is None else excludes

        if node.name not in excludes:
            node.level = level

            if node not in result:
                result.append(node)

            for predecessor in self.get_predecessor(node.name):
                # Push each predecessor at least one level above the current node.
                if predecessor.level >= level:
                    predecessor.level = level - 1
                result = self.get_backward_tree(predecessor, excludes, predecessor.level, result)

        return result

    def num_level(self, nodes: list) -> int:
        """Number of distinct levels among *nodes*."""
        return len({node.level for node in nodes})

    def reverse_level(self, nodes: list) -> list:
        """Shift every node's level up by the number of distinct levels."""
        shift = self.num_level(nodes)
        for node in nodes:
            node.level = shift + node.level
        return nodes

    # def get_fastestPath(self):
    #     returns fastest path to each target.
    #     Priority: fastest load of source to target.
""" + regex = r"\{(.*?):(.*?)?\}" + + entities = {} + + matches = re.finditer(regex, self.rawquery, re.MULTILINE) + for matchNum, match in enumerate(matches): + for groupNum in range(0, len(match.groups())): + entities[match.group(2)] = self.model.get_entity(match.group(1)) + + #regex101.com + regex = r"\{(.[^:]*?)\}" + matches = re.finditer(regex, self.rawquery, re.MULTILINE) + for matchNum, match in enumerate(matches): + for groupNum in range(0, len(match.groups())): + entities[match.group(1)] = self.model.get_entity(match.group(1)) + + return entities + + @property + def rawquery(self): + return self._definition.get('snapshotquery', '') + + def get_linked_entities(self): #FIXME: in docu aufnehmen?, überhaupt relevant? + """returns a list of linked entities.""" + return [self.model.get_entity(le) for le in self._definition['hubs'] + self._definition.get('links',[])] + + + def has_attributes(self): + return True if self._definition.get('bridgeattributes') else False + + def get_bridgeattributes(self): + return [DataVaultEntityAttribute(self, attrdef) for attrdef in self._definition.get('bridgeattributes', [])] + + def validate(self): + + errors = ErrorCollection() + + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + # Validating entity references: + if self._definition.get('snapshotquery'): + for name, e in self.get_query_entities().items(): + if e is None: + errors.add("VALIDATION ERROR", + (self.filename, "Bridge", "<" + self.name + ">"), + f'query-entity <{name}> not found.') + return errors + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in self.get_query_entities().values() if self != c ] + \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Composite.py b/DataVaultGenerator/Entities/Composite.py new file mode 100644 index 0000000..30f7c41 --- /dev/null +++ b/DataVaultGenerator/Entities/Composite.py @@ -0,0 +1,30 @@ +from 
DataVaultGenerator.Components import DataVaultEntity, ErrorCollection, MappingSource + + +class Composite(DataVaultEntity, MappingSource): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + MappingSource.__init__(self, model, self) + + @property + def query(self): + return self.model.get_parsed_query(self, self.rawquery) + + def get_query_entities(self): + return self.model.get_query_entities(self.rawquery) + + @property + def rawquery(self): + return self._definition.get('query', '') + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in self.get_query_entities().values()] + + def validate(self): + + errors = ErrorCollection() + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + return errors \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Delivery.py b/DataVaultGenerator/Entities/Delivery.py new file mode 100644 index 0000000..6d72226 --- /dev/null +++ b/DataVaultGenerator/Entities/Delivery.py @@ -0,0 +1,89 @@ +from DataVaultGenerator.Components import DataVaultEntity, MappingSource, ErrorCollection + + +class Delivery(DataVaultEntity, MappingSource): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + MappingSource.__init__(self, model, self) + + self.properties = definition.get('properties', {}) + + @property + def delta_attribute(self): + return self.get_attribute(self._definition.get('deltaattribute')) + + @property + def delta_initialvalue(self): + return self._definition.get('deltainitialvalue') + + @property + def recordsource(self): + return self._definition.get('recordsource', '') + + @property + def batchmode(self): + return self._definition.get('batchmode', 'single') # multi, single + + @property + def deliverymode(self): + return 'delta' if 
self._definition.get('deltaattribute') else 'full' # multi, single + + @property + def interfaces(self): + return [self.model.get_interface(i) for i in self._definition['interfaces']] + + @property + def ldts_source(self): + return self.get_attribute(self._definition.get('ldts_source')) + + @property + def overwrite_ldts(self): + return True if self._definition.get('ldts_source') else False + + @property + def query(self): + return self._definition.get('query', '') + + @property + def source_system(self): + if self._definition.get('sourcesystem'): + return self.model.get_source_system(self._definition.get('sourcesystem')) + else: + return self.interfaces[0].source_system + + @property + def source_type(self): + return self._definition.get('sourcetype',self.interfaces[0].source_type) + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in + self.interfaces] + + def validate(self): + + errors = ErrorCollection() + + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + # Validating entity references: + for i in self._definition['interfaces']: + if self.model.get_interface(i) is None: + errors.add("VALIDATION ERROR", + (self.filename, "Delivery", "<" + self.name + ">"), + f'Interface <{i}> not found') + + if self._definition.get('deltaattribute'): + if self.delta_attribute is None: + errors.add("VALIDATION ERROR", + (self.filename, "Delivery", "<" + self.name + ">"), + f'Deltaattribute <{self._definition.get("deltaattribute")}> not found in attributes.') + + if self._definition.get('ldts_source'): + if self.ldts_source is None: + errors.add("VALIDATION ERROR", + (self.filename, "Delivery", "<" + self.name + ">"), + f'ldts_source <{self._definition.get("ldts_source")}> not found in attributes.') + + return errors diff --git a/DataVaultGenerator/Entities/GenericTable.py b/DataVaultGenerator/Entities/GenericTable.py new file mode 100644 index 0000000..8382ff9 
--- /dev/null +++ b/DataVaultGenerator/Entities/GenericTable.py @@ -0,0 +1,22 @@ +from DataVaultGenerator.Components import DataVaultEntity, ErrorCollection + + +class GenericTable(DataVaultEntity): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + def get_component_entities(self): + c = [{'entity': self, 'component': c, 'type': c.type} + for c in self.model.get_entities_by_type('generictransformation') if self in c.get_target_entities() + ] + #FIXME: Um GenericTask erweitern + return c + + def validate(self): + + errors = ErrorCollection() + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + return errors \ No newline at end of file diff --git a/DataVaultGenerator/Entities/GenericTask.py b/DataVaultGenerator/Entities/GenericTask.py new file mode 100644 index 0000000..af71725 --- /dev/null +++ b/DataVaultGenerator/Entities/GenericTask.py @@ -0,0 +1,45 @@ +from DataVaultGenerator.Components import ErrorCollection, GeneratorEntity, DBEntity + + +class GenericTask(GeneratorEntity): + def __init__(self, model, filename, definition: dict = None): + GeneratorEntity.__init__(self, model, filename, definition) + + self._layername = definition.get('layer', self.model.config.entitydefaults[self.type]['layer']) + + + @property + def layer(self): + """Returns the entity layer.""" + return self.model.get_layer(self._layername) + + def validate(self): + + errors = ErrorCollection() + + # Validating entity references: + + for e in self._definition.get('sources'): + if self.model.get_entity(e) is None: + errors.add("VALIDATION ERROR", + (self.filename, "Generic Task", "<" + self.name + ">"), + f'source <{e}> not found') + + for e in self._definition.get('targets'): + if self.model.get_entity(e) is None: + errors.add("VALIDATION ERROR", + (self.filename, "Generic Task", "<" + self.name + ">"), + f'target <{e}> not found') + + return 
errors + + def get_source_entities(self): + """returns a list of linked entities.""" + return [self.model.get_entity(e) for e in self._definition['sources']] + + def get_target_entities(self): + """returns a list of linked entities.""" + return [self.model.get_entity(e) for e in self._definition['targets']] + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in self.get_source_entities()] diff --git a/DataVaultGenerator/Entities/GenericTransformation.py b/DataVaultGenerator/Entities/GenericTransformation.py new file mode 100644 index 0000000..978c961 --- /dev/null +++ b/DataVaultGenerator/Entities/GenericTransformation.py @@ -0,0 +1,77 @@ +from DataVaultGenerator.Components import ErrorCollection, GeneratorEntity, DBEntity + + +class GenericTransformation(GeneratorEntity): + def __init__(self, model, filename, definition: dict = None): + GeneratorEntity.__init__(self, model, filename, definition) + + self._layername = definition.get('layer', self.model.config.entitydefaults[self.type]['layer']) + + @property + def dbentity(self): + return DBEntity(self.name, + self, + self.model.config.layer.get(self._layername).get('defaultdatabaseobject'), + self._definition.get('dbentity')) + + @property + def layer(self): + """Returns the entity layer.""" + return self.model.get_layer(self._layername) + + def validate(self): + + errors = ErrorCollection() + + # Validating entity references: + + for e in self._definition.get('sources'): + if self.model.get_entity(e) is None: + errors.add("VALIDATION ERROR", + (self.filename, "Generic Transformation", "<" + self.name + ">"), + f'source <{e}> not found') + + for e in self._definition.get('targets'): + if self.model.get_entity(e) is None: + errors.add("VALIDATION ERROR", + (self.filename, "Generic Transformation", "<" + self.name + ">"), + f'target <{e}> not found') + + for name, e in self.get_query_entities().items(): + if e is None: + errors.add("VALIDATION ERROR", + 
(self.filename, "Generic Transformation", "<" + self.name + ">"), + f'query-entity <{name}> not found.') + + for name, e in self.get_query_entities().items(): + if name not in self._definition.get('sources') and name not in self._definition.get('targets'): + errors.add("VALIDATION ERROR", + (self.filename, "Generic Transformation", "<" + self.name + ">"), + f'query-entity <{name}> not specified as source or target.') + + return errors + + def get_source_entities(self): + """returns a list of linked entities.""" + return [self.model.get_entity(h) for h in self._definition['sources']] + + def get_target_entities(self): + """returns a list of linked entities.""" + return [self.model.get_entity(h) for h in self._definition['targets']] + + @property + def query(self): + return self.model.get_parsed_query(self, self.rawquery) + + def get_query_entities(self): + return self.model.get_query_entities(self.rawquery) + + @property + def rawquery(self): + return self._definition.get('query', '') + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in self.get_source_entities()] + + def get_attributes(self, roles=(), exclude=()): + return [] \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Hub.py b/DataVaultGenerator/Entities/Hub.py new file mode 100644 index 0000000..881be60 --- /dev/null +++ b/DataVaultGenerator/Entities/Hub.py @@ -0,0 +1,84 @@ +from DataVaultGenerator.Components import DataVaultEntity, DataVaultEntityAttribute, ErrorCollection + + +class Hub(DataVaultEntity): + + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + self.isCaseSensitive = definition.get('caseSesitive', False) + self.role_of = definition.get('roleof') + self.bkcc_attribute = definition.get('bkcc_attribute') + + if self.role_of: + self.generate = 0 + + key = DataVaultEntityAttribute(self, self.model.config.datavault.keyattribute) + key.name = 
self.key_columnname + self.add_attribute(key) + + @property + def key_columnname(self): + """returns name of the primary Key Attribute. If no name was defined in its definition, a template applies.""" + return self._definition.get('key', self.model.basetemplates.get('entity_key_name').render(entity=self)) + + @property + def hash_attribute_trim(self): + return self._definition.get('key_treatment' + , self.model.config.datavault.business_key_treatment)\ + .get('trim', + self.model.config.datavault.business_key_treatment.trim) + @property + def hash_attribute_case(self): + return self._definition.get('key_treatment' + , self.model.config.datavault.business_key_treatment)\ + .get('case', + self.model.config.datavault.business_key_treatment.case) + @property + def key_attribute(self): + return self.get_role_attribute(self.model.config.datavault.keyattribute.get('role','sk')) + + def get_satellites(self): + return [e for e in self.model.get_entities_by_type('satellite') if e.get_parent_entity() == self] + + def get_component_entities(self): + c = [{'entity': self, 'component': c, 'type': c.type} for c in + self.get_source_entities().values()] # holt derzeit nur die Deliveries über die Mappings + if self.role_of: + c.extend( + [{'entity': self, 'component': self.model.get_entity(self.role_of), 'type': 'hub'}] + ) + return c + + def validate(self): + + errors = ErrorCollection() + + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + # Validating entity references: + + # role-of-reference + + if self.role_of and self.model.get_entity(self.role_of) is None: + errors.add("VALIDATION ERROR", + (self.filename,"Hub", "<" + self.name + ">"), + f'role-of Hub <{self.role_of}> not found.') + + # constraints: + enforce_bk_type = self.model.config.datavault.constraints.get('enforce_bk_type') + if enforce_bk_type: + for attr in self.get_attributes('base'): + if attr.native_datatype not in enforce_bk_type: + 
errors.add("VALIDATION ERROR", + (self.filename,"Hub", "<" + self.name + ">"), + f'Datatype of attribute <{attr.name}> not valid (enforced: {enforce_bk_type})') + + return errors + + def get_roles(self): + """returns a list of hubs with this hub as role_of-target""" + + return [e for e in self.model.get_entities_by_type('hub', generatable_only=False) if e.role_of == self.name ] \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Interface.py b/DataVaultGenerator/Entities/Interface.py new file mode 100644 index 0000000..3703efd --- /dev/null +++ b/DataVaultGenerator/Entities/Interface.py @@ -0,0 +1,40 @@ +from DataVaultGenerator.Components import DataVaultEntity, DynamicProperties, ErrorCollection + + +class Interface(DataVaultEntity): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + self.properties = definition.get('properties', {}) + + self.prop = DynamicProperties.from_kwargs(**definition.get('properties', {})) + + + @property + def source_type(self): + return self._definition.get('sourcetype') + + @property + def source_system(self): + return self.model.get_source_system(self._definition.get('sourcesystem')) + + def get_component_entities(self): + return [{'entity': self, 'component': self.source_system, + 'type': self.source_system.type}] + + def validate(self): + + errors = ErrorCollection() + + # Validating sourcesystem: + sourcesystem = self._definition.get('sourcesystem') + if self.model.get_source_system(sourcesystem) is None: + errors.add("VALIDATION ERROR", + (self.filename, "Interface", "<" + self.name + ">"), + f'Sourcesystem <{sourcesystem}> not found') + + for attr in self.attributes.values(): + spec = self.source_system.sys_specification + errors.append(attr.validate(spec)) + + return errors diff --git a/DataVaultGenerator/Entities/Link.py b/DataVaultGenerator/Entities/Link.py new file mode 100644 index 0000000..d065ee4 --- /dev/null +++ 
b/DataVaultGenerator/Entities/Link.py @@ -0,0 +1,79 @@ +from DataVaultGenerator.Components import DataVaultEntity, DataVaultEntityAttribute, ErrorCollection + + +class Link(DataVaultEntity): + + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + self.drivingkeys = self._definition.get('drivingkeys',[]) + key = DataVaultEntityAttribute(self, self.model.config.datavault.keyattribute) + key.name = self.key_columnname + self.add_attribute(key) + + def get_drivingkey_entities(self): + return [self.model.get_entity(d) for d in self.drivingkeys] + + def get_foreign_attribute(self, name: str): + for e in self._definition['hubs']: + if self.model.get_entity(e).key_attribute.name == name: + return self.model.get_entity(e).key_attribute + + def get_foreign_attributes(self): + fa = [] + for e in self._definition['hubs']: + fa.append(DataVaultEntityAttribute(self, definition=dict(name=self.model.get_entity(e).key_attribute.name, + datatype=self.model.get_entity(e).key_attribute.datatype, + role='fk') + )) + return fa + + @property + def key_columnname(self): + """returns name of the primary Key Attribute. 
If no name was defined in its definition, a template applies.""" + return self._definition.get('key', self.model.basetemplates.get('entity_key_name').render(entity=self)) + + @property + def key_attribute(self): + return self.get_role_attribute(self.model.config.datavault.keyattribute.get('role','sk')) + + def get_linked_entities(self): + """returns a list of linked entities.""" + return [self.model.get_entity(le) for le in self._definition['hubs'] + self._definition.get('links',[])] + + def get_satellites(self): + return [e for e in self.model.get_entities_by_type('satellite') if e.get_parent_entity() == self] + + def validate(self): + + errors = ErrorCollection() + + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + # Validating entity references: + for name in self._definition['hubs']: + if self.model.get_entity(name) is None: + suggest = self.model.get_entity_name_suggestion('hub', name) + suggest = f'Do you mean <{suggest}>?' if suggest else '' + errors.add("VALIDATION ERROR", + (self.filename,"Link", "<" + self.name + ">"), + f'Hub <{name}> not found. ' + suggest) + + + + # Validating entity references: + for name in self._definition.get('links',[]): + if self.model.get_entity(name) is None: + suggest = self.model.get_entity_name_suggestion('link', name) + suggest = f'Do you mean <{suggest}>?' if suggest else '' + errors.add("VALIDATION ERROR", + (self.filename,"Link", "<" + self.name + ">"), + f'Link <{name}> not found. 
' + suggest) + + return errors + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in + self.get_source_entities().values()] # holt derzeit nur die Deliveries über die Mappings diff --git a/DataVaultGenerator/Entities/PIT.py b/DataVaultGenerator/Entities/PIT.py new file mode 100644 index 0000000..599953f --- /dev/null +++ b/DataVaultGenerator/Entities/PIT.py @@ -0,0 +1,88 @@ +from DataVaultGenerator.Components import DataVaultEntity, DataVaultEntityAttribute, ErrorCollection + + +class PIT(DataVaultEntity): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + self.baseentity = self._definition.get('baseentity') + self.snapshotmode = self._definition.get('snapshotmode') + self.snapshottable = self._definition.get('snapshottable') + self.snapshottableattribute = self._definition.get('snapshottableattribute') + + + def include_ledts(self): + return self._definition.get('include_ledts') + + @property + def snapshotattribute(self): + return DataVaultEntityAttribute(self, self._definition['snapshotattribute']) + + @property + def snapshotquery(self): + return self.model.get_parsed_query(self, self.rawsnapshotquery) + + def get_snaphotquery_entities(self): + return self.model.get_query_entities(self.rawsnapshotquery) + + @property + def rawsnapshotquery(self): + return self._definition.get('snapshotquery', '') + + @property + def query(self): + return self.model.get_parsed_query(self, self.query) + + def get_query_entities(self): + return self.model.get_query_entities(self.rawquery) + + @property + def rawquery(self): + return self._definition.get('query', '') + + def get_base_entity(self): + return self.model.get_entity(self.baseentity) + + def get_satellites(self): + return [self.model.get_entity(sat) for sat in self._definition.get('satellites')] + + def has_attributes(self): + return True if self._definition.get('pitattributes') else False + 
+ def get_pitattributes(self): + attributes = [] + + for attr in self._definition.get('pitattributes', []): + attributes.append(self.model.get_entity(attr[0]).get_attribute(attr[1])) + # attributes.append({'attribute': self.model.get_entity(attr[1]).get_attribute(attr[2]) + # , 'alias': attr[0]}) + return attributes + + def validate(self): + + errors = ErrorCollection() + + # Validating entity references: + if self._definition.get('snapshotquery'): + for name, e in self.get_snaphotquery_entities().items(): + if e is None: + errors.add("VALIDATION ERROR", + (self.filename, "PIT", "<" + self.name + ">"), + f'query-entity <{name}> not found.') + + if self._definition.get('query'): + for name, e in self.get_query_entities().items(): + if e is None: + errors.add("VALIDATION ERROR", + (self.filename, "PIT", "<" + self.name + ">"), + f'query-entity <{name}> not found.') + + return errors + + def get_component_entities(self): + c = [{'entity': self, 'component': c, 'type': c.type} for c in self.get_snaphotquery_entities().values()] + c.extend([{'entity': self, 'component': c, 'type': c.type} for c in self.get_query_entities().values()]) + c.extend([{'entity': self, 'component': c, 'type': c.type} for c in self.get_satellites()]) + c.extend([{'entity': self, 'component': self.get_base_entity(), 'type': self.get_base_entity().type}]) + + return c \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Reference.py b/DataVaultGenerator/Entities/Reference.py new file mode 100644 index 0000000..317d69b --- /dev/null +++ b/DataVaultGenerator/Entities/Reference.py @@ -0,0 +1,39 @@ +from DataVaultGenerator.Components import DataVaultEntity + +#TODO: um Möglichkeit, einen Satellite dranzuhängen erweitern sowie Laden über Mapping ermöglichen +class Reference(DataVaultEntity): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + @property + def key_attribute(self): #FIXME: nur EIN Key-Attribute 
könnte bei Referenz-Daten zu wenig sein. + return self.get_role_attribute('key') + #FIXME: Analag zu dem keyattribute (hk) müsste es für reference Daten einen globalen typ geben mit NOT NULL - Sonst gibt es Probleme beim PK-Index der NICHT NULL sein darf. Workarround: am key-attribue mandatory: true definieren + + def get_satellites(self): + return [e for e in self.model.get_entities_by_type('satellite') if e.get_parent_entity() == self] + + @property + def data(self): + return self._definition.get('data', {}) + + @property + def query(self): + return self.model.get_parsed_query(self, self.rawquery) + + def get_query_entities(self): + return self.model.get_query_entities(self.rawquery) + + @property + def rawquery(self): + return self._definition.get('query', '') + + def get_component_entities(self): + c = [{'entity': self, 'component': c, 'type': c.type} for c in + self.get_source_entities().values()] # holt derzeit nur die Deliveries über die Mappings + + c.extend([{'entity': self, 'component': c, 'type': c.type} + for c in self.model.get_entities_by_type('generictransformation') if self in c.get_target_entities() + ]) + #FIXME: um generictask erweitern + return c \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Report.py b/DataVaultGenerator/Entities/Report.py new file mode 100644 index 0000000..9a70a5a --- /dev/null +++ b/DataVaultGenerator/Entities/Report.py @@ -0,0 +1,26 @@ +from DataVaultGenerator.Components import ErrorCollection, GeneratorEntity + + +class Report(GeneratorEntity): + def __init__(self, model, filename, definition: dict = None): + GeneratorEntity.__init__(self, model, filename, definition) + + self._layername = definition.get('layer', self.model.config.entitydefaults[self.type]['layer']) + + @property + def layer(self): + """Returns the entity layer.""" + return self.model.get_layer(self._layername) + + @property + def dbentity(self): + return None + + def get_component_entities(self): + return [] + + def 
get_attributes(self, roles=(), exclude=()): + return [] + + def validate(self): + return ErrorCollection() \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Satellite.py b/DataVaultGenerator/Entities/Satellite.py new file mode 100644 index 0000000..0cdd2ae --- /dev/null +++ b/DataVaultGenerator/Entities/Satellite.py @@ -0,0 +1,54 @@ +from DataVaultGenerator.Components import DataVaultEntity, DataVaultEntityAttribute, ErrorCollection + + +class Satellite(DataVaultEntity): + def __init__(self, model, filename, definition: dict = None): + DataVaultEntity.__init__(self, model, filename, definition) + + self.parent = definition.get('parent') + + def get_foreign_attribute(self, name: str) -> DataVaultEntityAttribute: + return self.get_parent_key_attribute() if self.get_parent_key_attribute().name == name else None + + def get_parent_entity(self) -> DataVaultEntity: + return self.model.get_entity(self.parent) + + def get_parent_key_attribute(self) -> DataVaultEntityAttribute: + return self.get_parent_entity().key_attribute + + @property + def hashdiff_fk_attribute(self):#FIXME: Durch config/template flexibler gestalten + return self.get_role_attribute('hashdiff').copy(self.name + "_" + self.get_role_attribute('hashdiff').name) + + @property + def hash_attribute_trim(self): + return self._definition.get('hashdiff_attribute_treatment' + , self.model.config.datavault.hashdiff_attribute_treatment)\ + .get('trim', + self.model.config.datavault.hashdiff_attribute_treatment.trim) + @property + def hash_attribute_case(self): + return self._definition.get('hashdiff_attribute_treatment' + , self.model.config.datavault.hashdiff_attribute_treatment)\ + .get('case', + self.model.config.datavault.hashdiff_attribute_treatment.case) + + def get_component_entities(self): + return [{'entity': self, 'component': c, 'type': c.type} for c in + self.get_source_entities().values()] # holt derzeit nur die Deliveries über die Mappings + + def validate(self): + + errors = 
ErrorCollection() + + for attr in self.attributes.values(): + spec = self.layer.sys_specification + errors.append(attr.validate(spec)) + + # Validating entity references: + if self.get_parent_entity() is None: + errors.add("VALIDATION ERROR", + (self.filename,"Satellite", "<" + self.name + ">"), + f'Parent <{self.parent}> not found') + + return errors \ No newline at end of file diff --git a/DataVaultGenerator/Entities/Sourcesystem.py b/DataVaultGenerator/Entities/Sourcesystem.py new file mode 100644 index 0000000..a6b32bf --- /dev/null +++ b/DataVaultGenerator/Entities/Sourcesystem.py @@ -0,0 +1,22 @@ +from DataVaultGenerator.Components import GeneratorEntity + + +class SourceSystem(GeneratorEntity): + def __init__(self, model, filename, definition: dict = None): + GeneratorEntity.__init__(self, model, filename, definition) + self.shortname = self._definition.get('shortname', self.name) + self.sys_specification = self._definition.get('sys_specification', '') + + def get_interfaces(self): + return [i for i in self.model.interfaces.values() if i.source_system == self] + + def get_interface_count(self): + return sum(1 for i in self.model.interfaces.values() if i.source_system == self) + + @property + def connection_name(self): + return self._definition.get('connectionname', '') + + @property + def sourcesystem_type(self): + return self._definition.get('sourcesystemtype', '') \ No newline at end of file diff --git a/DataVaultGenerator/Entities/SubDag.py b/DataVaultGenerator/Entities/SubDag.py new file mode 100644 index 0000000..a6242de --- /dev/null +++ b/DataVaultGenerator/Entities/SubDag.py @@ -0,0 +1,84 @@ +from DataVaultGenerator.Dag import DagNode +from DataVaultGenerator.Components import ErrorCollection, GeneratorEntity + + +class SubDag(GeneratorEntity): + def __init__(self, model, filename, definition: dict = None): + GeneratorEntity.__init__(self, model, filename, definition) + + self.entrypoints = definition.get('entrypoints',[]) + self.key = 
definition.get('key',definition.get('name')) + + self.excludes = definition.get('excludes',[]) + + self.tree = [] + + def validate(self): + + errors = ErrorCollection() + + # Validating entity references: + + for ep in self.entrypoints: + if self.model.get_entity(ep) is None: + errors.add("VALIDATION ERROR", + (self.filename,"SubDag", "<" + self.name + ">"), + f'Entrypoint <{ep}> not found') + + for ex in self.excludes: + if self.model.get_entity(ex) is None: + errors.add("VALIDATION ERROR", + (self.filename,"SubDag", "<" + self.name + ">"), + f'Exclude <{ex}> not found') + + return errors + + def get_entrypoints_nodes(self): + if self.entrypoints: + return [self.model.dag.get_node(n) for n in self.entrypoints] + else: + return [n for n in self.model.dag.get_roots()] + + def get_tree(self): + return self.get_nodes() + + def get_nodes(self): + self.model.dag.reset() + + if self.subtype == 'forward': + r = [] + for en in self.get_entrypoints_nodes(): + r.extend(self.model.dag.get_forward_tree(en,excludes=self.excludes)) + self.tree = self.dedup_tree(r) + return self.tree + + if self.subtype == 'backward': + r = [] + for en in self.get_entrypoints_nodes(): + r.extend(self.model.dag.get_backward_tree(en)) + + r = self.model.dag.reverse_level(r) + self.tree = self.dedup_tree(r) + return self.tree + + return [] + + def dedup_tree(self, tree: list): + dedup = {} + for e in tree: + if e.name not in dedup: + dedup[e.name] = e + elif dedup[e.name].level < e.level: # Replace if existing elements level is lower than current elements level + dedup[e.name] = e + return [e for e in dedup.values()] + + + + def get_leveldict(self, nodes: list) -> dict: + # returns dict. Each key represents one level. Each level contains a list of nodes. 
import collections
import re
from DataVaultGenerator.Components import DataVaultEntity, MappingSource, DBEntity, ErrorCollection


class ViewAttribute():
    """One attribute of a View: ties a model attribute to its source components and optional reference."""

    def __init__(self, entity, definition):
        self.entity = entity
        self.definition = definition
        self.name = definition.get('name')
        self.attribute = entity.get_attribute(self.name)  # DataVaultEntityAttribute
        self.components = definition.get('components', [])  # e.g. ['alias.attributename']
        self.reference = definition.get('reference', '')  # 'entity.attribute'
        self.referencetype = definition.get('referencetype', '')  # 1:n, m:n, ..
        self.order = definition.get('order')

    def get_components(self):
        """Resolve 'alias.attribute' component strings to attribute instances."""
        resolved = []
        for component in self.components:
            parts = component.split('.')
            resolved.append(self.entity.get_query_entity_by_alias(parts[0]).get_attribute(parts[1]))
        return resolved

    def get_referenced_attribute(self):
        """Return the referenced attribute instance, or None when no reference is set."""
        if not self.reference:
            return None
        ref = self.reference.split('.')  # reference: entityname.attributename
        return self.entity.model.get_entity(ref[0]).get_attribute(ref[1])


class View(DataVaultEntity, MappingSource):
    """A (optionally materialized) view over other model entities.

    Example definition::

        name: customer_d
        type: view
        subtype: dimension          # dimension, fact
        layer: mart
        attributes:
          - {name: customer_id, type: 'char(40)', components: [h.customer_h_hk]}
          - {name: cust_no, type: 'varchar(32)', components: [h.cust_no]}
        materialize: true           # default: false
        materialization:
          mode: merge               # merge|full
          target: customer_d_mat    # default: name+'_mat'
          layer: mart               # default: same as view
          mergekeys: [customer_id, cust_no]
    """

    def __init__(self, model, filename, definition: dict = None):
        DataVaultEntity.__init__(self, model, filename, definition)
        MappingSource.__init__(self, model, self)

        self._viewattributes = collections.OrderedDict()

        self.materialize = definition.get('materialize', False)
        self.materialization = definition.get('materialization', {})

        for attrdef in definition['attributes']:
            self._viewattributes[attrdef.get('name')] = ViewAttribute(self, attrdef)

    def get_viewattributes(self, roles: list = 'all', exclude: list = ()):
        """returns a list of attributes for one or more given roles. You can exclude certain attribute-roles"""
        if 'all' in roles:
            return [va for va in self._viewattributes.values() if va.attribute.role not in exclude]
        return [va for va in self._viewattributes.values()
                if va.attribute.role in roles and va.attribute.role not in exclude]

    def get_viewattribute(self, name):
        return self._viewattributes.get(name)

    def safe_list_get(self, l, idx, default=None):
        """Index into a list, returning `default` instead of raising IndexError."""
        try:
            return l[idx]
        except IndexError:
            return default

    @property
    def query(self):
        """rawquery with {entityname:alias} placeholders replaced by rendered entity references."""
        parsed_result = self.rawquery

        for alias, entity in self.get_query_entities().items():
            if entity:
                # Qualify with the database name only when the entity lives in another database.
                include_db = entity.dbentity.database != self.dbentity.database
                replacement = self.model.basetemplates.get('query_entity_alias').render(
                    entity=entity, includeDB=include_db, alias=str(alias))
                parsed_result = parsed_result.replace('{' + str(entity.name) + ':' + str(alias) + '}', replacement)

        return parsed_result

    @property
    def rawquery(self):
        return self._definition.get('query', '')

    def get_query_entities(self):
        """ Parses Querystrings like: Select * from {entityname1:alias1} join {entityname2:alias2} and returns a
        dict of alias -> entity instance (None for unknown entity names).
        """
        regex = r"\{(.*?):(.*?)?\}"

        entities = {}
        # FIX: the former inner loop over match.groups() repeated the identical
        # assignment once per group; one assignment per match is equivalent.
        for match in re.finditer(regex, self.rawquery, re.MULTILINE):
            entities[match.group(2)] = self.model.get_entity(match.group(1))

        return entities

    def get_referenced_entities(self):
        """Distinct entities referenced by any view attribute's `reference`."""
        ref_entities = []
        for vattr in self._viewattributes.values():
            if vattr.reference:
                e = vattr.get_referenced_attribute().entity
                if e not in ref_entities:
                    ref_entities.append(e)
        return ref_entities

    def get_query_entity_by_alias(self, alias):
        return self.get_query_entities().get(alias)

    def get_component_entities(self):
        return [{'entity': self, 'component': c, 'type': c.type} for c in self.get_query_entities().values()]

    def get_component_attributes(self, attributename):
        """Return [{attribute, sourceentity, sourceattribute}] for one view attribute."""
        viewattribute = self.get_viewattribute(attributename)
        return [{'attribute': viewattribute.attribute,
                 'sourceentity': cattr.entity,
                 'sourceattribute': cattr} for cattr in viewattribute.get_components()]

    @property
    def materialization_dbentity(self):
        layername = self.materialization.get('layer', self._layername)
        dbobject = self.model.config.layer.get(layername).get('defaultdatabaseobject')
        return DBEntity(self.materialization.get('target'), self, dbobject, None)

    @property
    def materialization_rawquery(self):
        return self.materialization.get('query', '')

    @property
    def materialization_query(self):
        return self.model.get_parsed_query(self, self.materialization_rawquery)

    def get_materialization_query_entities(self):
        return self.model.get_query_entities(self.materialization_rawquery)

    def validate(self):
        """Validate attributes, query entity aliases, component and reference targets."""
        errors = ErrorCollection()

        spec = self.layer.sys_specification  # hoisted: invariant over attributes
        for attr in self.attributes.values():
            errors.append(attr.validate(spec))

        # Validating entity references:
        if self._definition.get('query'):
            for alias, entity in self.get_query_entities().items():
                if entity is None:
                    errors.add("VALIDATION ERROR",
                               (self.filename, "View", "<" + self.name + ">"),
                               f'Viewentity for alias <{alias}> not found.')

        # Skip the remaining validations because of errors above:
        if errors.count > 0:
            return errors

        # Validating component references:
        viewentities = self.get_query_entities()
        for vattrname, vattr in self._viewattributes.items():
            for comp in vattr.components:
                c = comp.split('.')
                if c[0] not in viewentities:
                    errors.add("VALIDATION ERROR",
                               (self.filename, "View", "<" + self.name + ">", "Attribute <" + vattrname + ">"),
                               f'components: Viewentity for alias <{c[0]}> not found.')
                elif self.get_query_entity_by_alias(c[0]).get_attribute(c[1]) is None:
                    errors.add("VALIDATION ERROR",
                               (self.filename, "View", "<" + self.name + ">", "Attribute <" + vattrname + ">"),
                               f'components: Attribute <{c[1]}> for alias <{c[0]}> not found.')

        # Validating attribute references:
        for vattrname, vattr in self._viewattributes.items():
            if vattr.reference:
                ref = vattr.reference.split('.')
                entity = self.model.get_entity(ref[0])

                if entity is None:
                    errors.add("VALIDATION ERROR",
                               (self.filename, "View", "<" + self.name + ">", "Attribute <" + vattrname + ">"),
                               f'reference: Entity <{ref[0]}> not found.')
                elif entity.get_attribute(ref[1]) is None:
                    errors.add("VALIDATION ERROR",
                               (self.filename, "View", "<" + self.name + ">", "Attribute <" + vattrname + ">"),
                               f'reference: Attribute <{ref[1]}> for entity <{ref[0]}> not found.')

        return errors
class AttributeMappingExpression:
    """
    expression => Expression, e.g.: "concat({attribute1},'-',{attribute2})"
    entity => base entity the {placeholders} are resolved against
    """

    def __init__(self, entity, expression: str = '', resulttype: str = '', alias: str = ''):
        self._rawexpression = expression
        self._resulttype = resulttype
        self.entity = entity
        self.alias = alias

    @property
    def datatype(self):
        return self._resulttype

    @property
    def native_datatype(self):
        """Returns the native datatype expression. E.g. nvarchar(100) -> nvarchar"""
        # IMPROVE: should really be explicit in the attribute definition.
        paren = self.datatype.find('(')  # computed once instead of twice
        return self.datatype[:paren].strip().lower() if paren != -1 else self.datatype

    @property
    def native_datatypelength(self):
        """Returns the native datatype length. E.g. nvarchar(100) -> '100' ('' when no parentheses)"""
        # IMPROVE: should really be explicit in the attribute definition.
        paren = self.datatype.find('(')
        if paren == -1:
            return ''
        return self.datatype[paren + 1:self.datatype.find(')')].strip().lower()

    @property
    def expression(self):
        """Raw expression with every {placeholder} replaced via the attribute_expression template."""
        template = self.entity.model.basetemplates.get('attribute_expression')
        parsed_result = self._rawexpression

        # IMPROVE: only the placeholder name is needed here, not the attribute instance.
        for placeholder, attr in self.get_expression_attributes().items():
            if attr:
                parsed_result = parsed_result.replace('{' + str(placeholder) + '}',
                                                      template.render(component=[attr.name]))

        return parsed_result

    def get_expression_attributes(self):
        """ Parses strings like: concat({attribute1},'-',{attribute2})
        and returns a dict of placeholder name -> attribute instance (None for unknown names).
        """
        regex = r"\{(.*?)?\}"

        attributes = {}
        # FIX: the former inner loop over match.groups() repeated the identical
        # assignment once per group; one assignment per match is equivalent.
        for match in re.finditer(regex, self._rawexpression, re.MULTILINE):
            attributes[match.group(1)] = self.entity.get_attribute(match.group(1))

        return attributes
""" + regex = r"\{(.*?)?\}" + + attributes = {} + + matches = re.finditer(regex, self._rawexpression, re.MULTILINE) + for matchNum, match in enumerate(matches): + for groupNum in range(0, len(match.groups())): + attributes[match.group(1)] = self.entity.get_attribute(match.group(1)) + + return attributes + +class AttributeMapping: + def __init__(self, targetmapping, source:str, target:str, transformation:str=''): + self.targetmapping = targetmapping + + self._source = source # => 'attributename' or '{expression: "concat({attribute1},...)"}' + self._target = target # => 'attributename' + self.transformation = transformation + + def __repr__(self): + return "AttributeMapping: <{0}> -> <{1}>".format(self._source, self._target) + + @property + def source(self): + entity = self.targetmapping.sourceentity #IMPROVE: ggf. schon beim _init_ ermitteln, wenn damit die Validierung nicht vorweggenommen wird + if type(self._source) is dict: + return AttributeMappingExpression(entity , expression = self._source.get('expression') + , resulttype = self.target.datatype + , alias = self.target.name ) + else: #falls nur ein attributname angegeben wurde, erzeuge künstliche expression: + attr = entity.get_attribute(self._source) + return AttributeMappingExpression(entity , expression = '{'+ self._source+'}' + , resulttype = attr.datatype + , alias = attr.name ) + + @property + def target(self): + #entity = self.targetmapping.model.get_entity(self.targetmapping.targetentityname) #IMPROVE: ggf. 
schon beim _init_ ermitteln, wenn damit die Validierung nicht vorweggenommen wird + return self.targetmapping.targetentity.get_attribute(self._target) + + @property + def targetattribute_name(self): #fixme: müsste umgestellt werden auf target.name + return self._target + + @property + def transformation_name(self): + return self.transformation + + def validate(self): + + errors = ErrorCollection() + + if self.targetmapping.targetentity.get_attribute(self._target) is None: + errors.add("VALIDATION ERROR", + ("Mapping", "<" + self.targetmapping.sourceentityname + ">","target <"+self.targetmapping.targetentityname+">"), + f'target attribute <{self._target}> not found') + + sourceentity = self.targetmapping.sourceentity + if type(self._source) is dict: + + am_expr = AttributeMappingExpression(sourceentity , expression = self._source.get('expression'), resulttype = None, alias = None) + for attrname, attr in am_expr.get_expression_attributes().items(): + if attr is None: + errors.add("VALIDATION ERROR", + ("Mapping", "<" + self.targetmapping.sourceentityname + ">", + "target <" + self.targetmapping.targetentityname + ">"), + f'attribute <{attrname}> in sourceexpression "{am_expr._rawexpression}" not found') + + else: #falls nur ein attributname angegeben wurde, erzeuge künstliche expression: + + attr = sourceentity.get_attribute(self._source) + if attr is None: + errors.add("VALIDATION ERROR", + ("Mapping", "<" + self.targetmapping.sourceentityname + ">", + "target <" + self.targetmapping.targetentityname + ">"), + f'source attribute <{self._source}> not found') + + return errors + +class TargetMapping: + def __init__(self, model, sourceentityname: str, definition: dict = None): + self.model = model + self._definition = definition + self.sourceentityname = sourceentityname + self.targetentityname = definition.get('target') + + self.mappingmode = definition.get('mappingmode', 'implicit') + self.type = definition.get('type', 'master') + self.attributemappings = [] + 
class TargetMapping:
    """One source -> target entity mapping, holding the resolved attribute mappings.

    Implicit mode pre-populates mappings for the target's implicit roles; explicit
    entries then replace matching implicit ones (or are appended).
    """

    def __init__(self, model, sourceentityname: str, definition: dict = None):
        self.model = model
        self._definition = definition
        self.sourceentityname = sourceentityname
        self.targetentityname = definition.get('target')

        self.mappingmode = definition.get('mappingmode', 'implicit')
        self.type = definition.get('type', 'master')
        self.attributemappings = []
        self.explicitattributemappings = definition.get('mapping')

        # TODO: belongs in validate(); the early return at least keeps console output sane.
        if self.targetentity is None:  # cf. @property
            logging.error('mapping: <%s> - target <%s> not found', self.sourceentityname, self.targetentityname)
            return

        if self.mappingmode == 'implicit':
            # get implicit roles from config.entitydefaults.
            implicit_roles = self.model.config.entitydefaults.get(self.targetentity.type).get('map_implicit_roles', 'base')
            for ta in self.targetentity.get_attributes(implicit_roles):
                self.attributemappings.append(AttributeMapping(self, ta.name, ta.name))
            logging.debug('Mapping <%s>: Created implicit attribute mappings for target <%s>: %s',
                          self.sourceentityname, self.targetentityname, self.attributemappings)

        if self.explicitattributemappings:
            logging.debug("explicit mappings: %s", self.explicitattributemappings)
            logging.debug("result mappings (1): %s", self.attributemappings)

            # Process all explicit mappings, overwriting an implicit mapping when one matches.
            for em in self.explicitattributemappings:  # em = [source, target(, transformation)]
                transformationname = em[2] if len(em) == 3 else ''
                new_mapping = AttributeMapping(self, em[0], em[1], transformationname)

                if self.mappingmode == 'explicit':
                    self.attributemappings.append(new_mapping)
                    continue

                # BUGFIX: the previous hand-rolled counter `i` was not reset between
                # explicit mappings, so after a miss the next search produced a shifted
                # index (the FIXME'd "index overflow"). enumerate() yields the correct
                # index directly.
                existing_index = next((idx for idx, am in enumerate(self.attributemappings)
                                       if am._source == em[0] or am._target == em[1]), None)

                if existing_index is not None:
                    logging.debug('Mapping <%s>: Replace implicit Mapping: %s at index %s with explicit mapping: %s',
                                  self.sourceentityname, self.attributemappings[existing_index],
                                  existing_index, em)
                    self.attributemappings[existing_index] = new_mapping
                else:
                    logging.debug('Mapping <%s>: Adding explicit attributemapping %s', self.sourceentityname, em)
                    self.attributemappings.append(new_mapping)

            logging.debug("result mappings (2): %s", self.attributemappings)

    @property
    def targetentity(self):
        return self.model.get_entity(self._definition.get('target'))

    @property
    def sourceentity(self):
        return self.model.get_entity(self.sourceentityname)

    def get_attribute_mappings(self):
        return self.attributemappings

    def validate(self):
        """Validate all attribute mappings and reject duplicate explicit targets."""
        errors = ErrorCollection()

        for am in self.get_attribute_mappings():
            errors.append(am.validate())

        # TODO: check for data truncation (only sensible once datatype definitions are explicit).

        # ------ Validation of explicit mappings: ---------
        if self.explicitattributemappings:
            seen_targets = []
            for em in self.explicitattributemappings:
                if em[1] not in seen_targets:
                    seen_targets.append(em[1])
                else:
                    errors.add("VALIDATION ERROR",
                               ("Mapping", "<" + self.sourceentityname + ">",
                                "target <" + self.targetentityname + ">"),
                               f'More than one attribute from same source mapped to <{em[1]}>')

        return errors
class Mapping:
    """All target mappings defined for one source entity (delivery)."""

    def __init__(self, model, sourceentityname: str, filename: str, definition: dict = None):
        self.model = model
        self.definition = definition
        self.targetmappings = {}
        self.sourceentityname = sourceentityname
        self.filename = filename
        self.type = 'mapping'

        # FIXME: loading the TargetMappings in __init__ prevents a clean separation of validate and load.
        for tm in self.definition:
            self.targetmappings[tm.get('target')] = TargetMapping(model, sourceentityname, tm)

    def get_attribute_mappings_by_target(self, targetentityname: str):
        tm = self.get_targetmapping_by_target(targetentityname)
        am = tm.get_attribute_mappings()
        implicit_roles = self.model.config.entitydefaults.get(tm.targetentity.type).get('map_implicit_roles', 'base')
        # Links may have no attributes, so only report an error when the target actually has some.
        if not am and tm.targetentity.get_attributes(implicit_roles):
            logging.error("Mapping <%s>: No mapping for '%s' found", self.sourceentityname, targetentityname)
        return am

    def get_target_entities(self):
        """returns list of direct mapped entites"""
        return [tm.targetentity for tm in self.targetmappings.values()]

    def validate(self):
        """Validate source entity, target entities, link completeness and all target mappings."""
        errors = ErrorCollection()

        # ------ Validating Entity: ---------
        # BUGFIX: an unconditional `return errors` here previously made every check
        # below unreachable (dead code). Return early only when the source entity is
        # missing, since the later checks depend on it.
        if self.model.get_entity(self.sourceentityname) is None:
            errors.add("VALIDATION ERROR",
                       (self.filename, "Mapping", "<" + self.sourceentityname + ">"),
                       "delivery <" + self.sourceentityname + "> not found")
            return errors

        # ------ Validation of linked entities -----
        # Validation if all linked entities are present in the same mapping
        for tm in self.definition:
            if self.model.get_entity(tm.get('target')) is None:
                errors.add("VALIDATION ERROR",
                           (self.filename, "Mapping", "<" + self.sourceentityname + ">"),
                           "target <" + tm.get('target') + "> not found")

        if errors.count > 0:
            return errors

        targetentities = self.get_target_entities()
        links = (e for e in targetentities if e.type == 'link')

        for link in links:
            for le in link.get_linked_entities():
                if le not in targetentities:
                    errors.add("VALIDATION ERROR",
                               (self.filename, "Mapping", "<" + self.sourceentityname + ">"),
                               "linked entity for link <" + link.name + "> is missing. Please provide a mapping for <" + le.name + "> in this mapping")

        # ------ Validating Targetmapping: ---------
        for name, tm in self.targetmappings.items():
            errors.append(tm.validate())

        return errors

    def get_targetmapping_by_target(self, target: str):
        return self.targetmappings.get(target)

    def get_targetmappings(self):
        return self.targetmappings
for e in targetentities if e.type == 'link') + + for link in links: + for le in link.get_linked_entities(): + if le not in targetentities: + errors.add("VALIDATION ERROR", + (self.filename, "Mapping", "<" + self.sourceentityname + ">"), + "linked entity for link <" + link.name + "> is missing. Please provide a mapping for <"+le.name+"> in this mapping") + + return errors + + # ------ Validating Targetmapping: --------- + for name, tm in self.targetmappings.items(): + errors.append(tm.validate()) + + return errors + + def get_targetmapping_by_target(self, target: str): + return self.targetmappings.get(target) + + def get_targetmappings(self): + return self.targetmappings diff --git a/DataVaultGenerator/Model.py b/DataVaultGenerator/Model.py new file mode 100644 index 0000000..57bb626 --- /dev/null +++ b/DataVaultGenerator/Model.py @@ -0,0 +1,923 @@ +#from DataVaultGenerator.Entities.Derived import Derived +from tokenize import String +from DataVaultGenerator.Entities.SubDag import SubDag +import logging +import glob +import time +from pathlib import Path, PurePath +from shutil import Error, copy2 + +from rich.progress import Progress +from rich.progress import TextColumn, BarColumn, SpinnerColumn, TaskProgressColumn, TimeElapsedColumn +from rich.tree import Tree +from rich.panel import Panel +from rich.table import Table +from rich.rule import Rule +from rich import box +from rich.style import Style +from rich import print + +import yaml +try: + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + +import sys +import hashlib +import re + +import os +import subprocess +from os import makedirs +from cerberus import Validator, schema_registry, rules_set_registry +from jinja2 import Environment, FileSystemLoader, TemplateNotFound, UndefinedError +from datetime import datetime + +from DataVaultGenerator.Config import Config +from DataVaultGenerator.Components import ErrorCollection +from DataVaultGenerator.Components import Layer, log +from 
from DataVaultGenerator.Entities.Composite import Composite
from DataVaultGenerator.Entities.Delivery import Delivery
from DataVaultGenerator.Entities.GenericTable import GenericTable
from DataVaultGenerator.Entities.GenericTransformation import GenericTransformation
from DataVaultGenerator.Entities.GenericTask import GenericTask
from DataVaultGenerator.Entities.Hub import Hub
from DataVaultGenerator.Entities.Interface import Interface
from DataVaultGenerator.Entities.Link import Link
from DataVaultGenerator.Entities.PIT import PIT
from DataVaultGenerator.Entities.Bridge import Bridge
from DataVaultGenerator.Entities.Reference import Reference
from DataVaultGenerator.Entities.Report import Report
from DataVaultGenerator.Entities.Satellite import Satellite
from DataVaultGenerator.Entities.Sourcesystem import SourceSystem
from DataVaultGenerator.Entities.View import View
from DataVaultGenerator.Entities.SubDag import SubDag
from DataVaultGenerator.Mapping import Mapping
from DataVaultGenerator.Config import ConfigDict
from DataVaultGenerator.Dag import Dag, DagNode

from DataVaultGenerator import __version__


class Model:
    """Central model registry: entities, mappings, source systems, subdags, layers and the DAG."""

    def __init__(self):
        logging.info('Init Model')

        self.config = Config()
        self.entities = {}
        self.mappings = {}
        self.sourcesystems = {}
        self.subdags = {}
        self.interfaces = {}
        self.layer = {}
        self.basetemplates = {}
        self.cdc = {}
        self.types = {}
        self.dag = Dag(self)
        self.sys_specifications = {}

        self.load_schema()
        self.load_types()

        self.validator = Validator(allow_unknown=False)

    @property
    def name(self):
        return self.config.model.name

    def load_schema(self):
        """Register schema/rule-set fragments from schema/registry/*.yaml with cerberus."""
        modpath = os.path.dirname(__file__)
        for f in glob.glob(os.path.join(modpath, "schema/registry/*.yaml"), recursive=True):
            with open(f, 'r') as file:
                definition = yaml.load(file, Loader=Loader)
            if definition.get('type') == 'schema':
                schema_registry.add(definition.get('name'), definition.get('schema'))
            else:
                rules_set_registry.add(definition.get('name'), definition.get('schema'))

    def load_types(self):
        """Load entity and base type definitions shipped with the package."""
        modpath = os.path.dirname(__file__)

        for f in glob.glob(os.path.join(modpath, "schema/entities/*.yaml"), recursive=False):
            with open(f, 'r') as file:
                definition = yaml.load(file, Loader=Loader)
            self.types[definition.get('type')] = definition

        for f in glob.glob(os.path.join(modpath, "schema/*.yaml"), recursive=False):
            with open(f, 'r') as file:
                definition = yaml.load(file, Loader=Loader)
            self.types[definition.get('type')] = definition

    def get_types(self):
        return self.types

    def get_type_property(self, type, property):
        return self.types.get(type, {}).get(property, '')

    def get_boilerplate(self, type):
        return self.get_type_property(type, 'boilerplate')

    def get_subtypes(self):
        # static catalogue of known subtypes and their display names
        return {'base': {'displayname': 'Base/Default'},
                'drivingkeystatus': {'displayname': 'Status-Satellite for Driving Key'},
                'fact': {'displayname': 'Fact Table/View'},
                'dimension': {'displayname': 'Dimension Table/View'},
                '': {'displayname': 'n/a'}
                }

    def get_type_displayname(self, entity_type):
        return self.get_types().get(entity_type, {}).get('displayname', 'Unknown Type: ' + entity_type)

    def get_entities(self, generatable_only: bool = True):
        if generatable_only:
            return {k: v for k, v in self.entities.items() if v.generate == 1}
        return self.entities

    def get_entities_by_type(self, entity_type: str, generatable_only: bool = True):
        if generatable_only:
            return [e for e in self.entities.values() if e.type == entity_type and e.generate == 1]
        return [e for e in self.entities.values() if e.type == entity_type]

    def get_entity(self, name: str):
        return self.entities.get(name)

    def get_layers(self):
        return self.layer

    def get_layer(self, name: str):
        return self.layer.get(name)

    def get_source_systems(self):
        return self.sourcesystems

    def get_source_system(self, name: str):
        return self.sourcesystems.get(name)

    def get_subdags(self):
        return self.subdags

    def get_subdag(self, name: str):
        return self.subdags.get(name)

    def get_interfaces(self):
        return self.interfaces

    def get_interface(self, name: str):
        return self.interfaces.get(name)

    def get_interface_by_source_system(self, sourcesystem):
        return [i for i in self.interfaces.values() if i.source_system == sourcesystem]

    def get_mapping(self, name: str):
        """return a dict of mapping by Source."""
        return self.mappings.get(name)

    def get_mappings(self):
        """return a dict of mappings."""
        return self.mappings

    def load_config(self, filename):
        """Load config, set up the Jinja environment, layers, base templates and sys specifications."""
        logging.info('reading config from %s', filename)

        self.config.load(filename, self.get_type_property('config', 'schema'), self.validate_definition)

        self.templateEnvironment = Environment(
            loader=FileSystemLoader(self.config.path.joinpath(self.config.paths.templates)),
            **self.config.jinja.environment
        )

        self.templateEnvironment.globals['generator_version'] = __version__
        self.templateEnvironment.globals['now'] = datetime.now

        # unfold vars
        for k, v in self.config.vars.items():
            self.templateEnvironment.globals['_' + k + '_'] = v

        for layerid, layerdefinition in self.config.layer.items():
            self.layer[layerid] = Layer(self, layerid, layerdefinition)

        for templatekey, templatesource in self.config.basetemplates.items():
            try:
                templatefilename = self.templateEnvironment.from_string(templatesource).render(model=self)
                self.basetemplates[templatekey] = self.templateEnvironment.get_template(templatefilename)
            except TemplateNotFound:
                print(f"Config: Base-Template {templatefilename} not found.")
                logging.error(f"Config: Base-Template {templatefilename} not found.")
                sys.exit(2)

        for k, v in self.config.sys_specification.items():
            folder = self.config.path.joinpath(v)
            try:
                with open(folder, 'r') as file:
                    specyaml = yaml.load(file, Loader=Loader)

                is_valid = self.validate_definition('sys_specification definition',
                                                    self.get_type_property('sys_specification', 'schema'),
                                                    specyaml)
                if is_valid:
                    self.sys_specifications[k] = specyaml
                else:
                    print(f"Config: sys_specification definition {folder} is not valid.")
                    logging.error(f"Config: sys_specification definition {folder} is not valid.")
                    sys.exit(2)
            except FileNotFoundError as e:
                print(f"Config: sys_specification {folder} not found.")
                logging.error(e)
                sys.exit(2)
            except yaml.scanner.ScannerError as e:
                print("")
                logging.error(e)
                sys.exit(2)

    def get_config(self):
        return self.config

    def get_file_content(self, filename: str):
        with open(filename, 'r') as file:
            return file.read()

    def save_file_content(self, filename: str, content):
        print('Saving content to file: ' + filename)
        with open(filename, 'w') as file:
            file.write(content)

    def create_entity(self, f, entityyml):
        """Instantiate the entity class matching the definition's type; None for unknown types."""
        entitytype = entityyml.get('type')

        # FIXME: should be evaluated one processing step earlier
        schema = self.get_type_property(entitytype, 'schema')
        if schema:
            self.validate_definition(entityyml.get('type') + ': ' + entityyml.get('name'), schema, entityyml)

        classmap = {
            'hub': Hub,
            'delivery': Delivery,
            'satellite': Satellite,
            'link': Link,
            'view': View,
            'pit': PIT,
            'bridge': Bridge,
            'reference': Reference,
            'sourcesystem': SourceSystem,
            'source': Interface,
            'generictable': GenericTable,
            'generictransformation': GenericTransformation,
            'generictask': GenericTask,
            'report': Report,
            'composite': Composite,
            'subdag': SubDag,
        }

        entity_class = classmap.get(entitytype)
        return entity_class(self, f, entityyml) if entity_class else None

    def add_entity(self, entity):
        """adds a new entitity. If an entity with the same name exists, it wont work."""
        if entity.name not in self.entities:
            self.entities[entity.name] = entity
        else:
            logging.error('entity %s already exists in model', entity.name)
def update_entity(self, entity):
    """replace an existing entity with a new one."""
    self.entities[entity.name] = entity

def load_entity_from_file(self, filename):
    pass

def validate_definition(self, title, schema, definition, allow_unknown=False, failonerror=True):
    """Validate one definition against a cerberus schema; on failure render a rich error tree.

    Returns True when valid; False (or exits when failonerror) otherwise.
    """
    self.validator.allow_unknown = allow_unknown

    if self.validator.validate(definition, schema):
        return True

    tree = Tree(title)
    for field, errors in self.validator.errors.items():
        for e in errors:
            fieldtree = tree.add(field)
            if type(e) is dict:
                for itemno, itemerrors in e.items():
                    itemtree = fieldtree.add('item' + str(itemno))
                    for itemerror in itemerrors:
                        if type(itemerror) is dict:
                            for fieldname, fielderror in itemerror.items():
                                itemtree.add(fieldname + ': ' + str(fielderror))
                        else:
                            itemtree.add(str(itemerror))
            else:
                fieldtree.add(str(e))

    print(Panel(tree, title="[red]SCHEMA ERROR", expand=False, padding=1))
    logging.error(self.validator.errors)

    if failonerror:
        sys.exit(2)
    return False

def validate_entities_schemas(self):
    """Schema-check every entity YAML file; returns the number of invalid files."""
    errorcount = 0
    # NOTE(review): unlike load_entities this globs the raw config path without
    # joining config.path — presumably relies on the current working directory; confirm.
    folder = self.config.paths.entities
    for f in glob.glob(folder + "/**/*.yaml", recursive=True):
        try:
            with open(f, 'r') as file:
                entityyml = yaml.load(file, Loader=Loader)
            baseschema = {'name': {'type': 'string', 'required': True},
                          'type': {'required': True,
                                   'allowed': list(self.get_types().keys())}
                          }
            valid_base = self.validate_definition(f, baseschema, entityyml, True, False)
            if valid_base:
                entitytype = entityyml.get('type')
                schema = self.get_type_property(entitytype, 'schema')
                if schema:
                    valid = self.validate_definition(entityyml.get('type') + ': ' + entityyml.get('name'),
                                                     schema, entityyml, False, False)
                    if not valid:
                        errorcount += 1
            else:
                errorcount += 1
        except (yaml.scanner.ScannerError, UnicodeDecodeError) as e:
            print("")
            logging.error(e)
            sys.exit(2)

    return errorcount

def load_entities(self):
    """Read all entity YAML files and register them in their namespace-specific registries."""
    folder = self.config.path.joinpath(self.config.paths.entities)
    logging.info('reading entities from: %s', folder)

    baseschema = {'name': {'type': 'string', 'required': True},
                  'type': {'required': True,
                           'allowed': list(self.get_types().keys())},
                  'description': {'type': 'string'},
                  'subtype': {'type': 'string'},
                  'generate': {'type': 'integer', 'allowed': [0, 1]}
                  }

    numfiles = len(list(folder.glob('**/*.yaml')))

    with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(style=Style(color="green")),
            TimeElapsedColumn()
    ) as progress:

        task1 = progress.add_task("[blue]Loading: ", total=numfiles)

        for f in sorted(folder.glob('**/*.yaml')):

            if f.name.startswith(self.config.model.ignore_file_prefix):
                logging.info('ignore file because of prefix: %s', f.relative_to(folder))
                continue

            logging.info('reading entity: %s', f.relative_to(folder))
            try:
                with open(f, 'r') as file:
                    entityyml = yaml.load(file, Loader=Loader)

                if not entityyml:
                    print("")
                    logging.error('document empty: ' + f.name)
                    print('document empty: ', f.name)
                    sys.exit(2)

                self.validate_definition(f.name, baseschema, entityyml, True)

            except (yaml.scanner.ScannerError, UnicodeDecodeError) as e:
                print("")
                logging.error(e)
                sys.exit(2)

            entitytype = entityyml.get('type')
            namespace = self.get_type_property(entitytype, 'namespace')

            if namespace == 'model':
                self.add_entity(self.create_entity(f, entityyml))

            # IMPROVE: the following registries are filled inconsistently
            # with self.add_entity (direct dict assignment instead).
            if namespace == 'sourcesystem':
                self.sourcesystems[entityyml.get('name')] = self.create_entity(f, entityyml)

            if namespace == 'source':
                self.interfaces[entityyml.get('name')] = self.create_entity(f, entityyml)

            if namespace == 'dag':
                self.subdags[entityyml.get('name')] = self.create_entity(f, entityyml)

            progress.update(task1, advance=1)

def load_mappings(self):
    """Read all mapping YAML files and build Mapping instances keyed by source entity."""
    folder = self.config.path.joinpath(self.config.paths.mappings)
    logging.info('reading mappings from %s', folder)

    schema = self.get_type_property('mapping', 'schema')

    for f in sorted(folder.glob('**/*.yaml')):
        logging.info('reading mapping: %s', f.relative_to(folder))
        try:
            with open(f, 'r') as file:
                mappingyml = yaml.load(file, Loader=Loader)

            self.validate_definition('Mapping: ' + f.name, schema, {'root': mappingyml})

            for sourceentityname in mappingyml.keys():
                self.mappings[sourceentityname] = Mapping(self, sourceentityname, f.name,
                                                          mappingyml[sourceentityname])
        except yaml.scanner.ScannerError as e:
            print("")
            logging.error(e)
            sys.exit(2)

def validate_mappings(self):
    """Validate every mapping, log its errors and return the merged ErrorCollection."""
    errors = ErrorCollection()

    with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
    ) as progress:

        task1 = progress.add_task("[blue]Validating Mappings: ", total=len(self.mappings))

        for m in self.get_mappings().values():
            errors.append(m.validate())
            progress.update(task1, advance=1)

    for rm in errors.errors:  # TODO: merge message-paths
        log(logging.ERROR, rm.get('title'), rm.get('path'), rm.get('message'), True)

    if errors.count != 0:
        logging.error('%i errors found while validating mappings.', errors.count)

    return errors

def validate_entities(self):
    """Validate entities, interfaces and subdags; log and return the merged ErrorCollection."""
    errors = ErrorCollection()

    entity_count = len(self.entities) + len(self.interfaces) + len(self.subdags)

    with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
    ) as progress:

        task1 = progress.add_task("[blue]Validating Entities: ", total=entity_count)

        for e in self.entities.values():
            progress.update(task1, advance=1)
            errors.append(e.validate())

        for e in self.get_interfaces().values():
            progress.update(task1, advance=1)
            errors.append(e.validate())

        for e in self.get_subdags().values():
            progress.update(task1, advance=1)
            errors.append(e.validate())

    for rm in errors.errors:  # TODO: merge message-paths
        log(logging.ERROR, rm.get('title'), rm.get('path'), rm.get('message'), True)

    if errors.count != 0:
        logging.error('%i errors found while validating entities.', errors.count)

    return errors

def build_dag(self):
    """Add every generatable model-namespace entity and its component edges to the DAG."""
    logging.info('building dag... ')
    # TODO: add a per-entity option to ignore an entity in the DAG
    entity_count = len(self.get_entities().keys())

    with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
    ) as progress:

        task1 = progress.add_task("[blue]Building Dag: ", total=entity_count)

        for i, (k, e) in enumerate(self.get_entities().items()):
            progress.update(task1, advance=1)
            namespace = self.get_type_property(e.type, 'namespace')

            if namespace == 'model':
                logging.info('adding node: %s', e.name)
                self.dag.add_node(DagNode(e.name, e))
                for c in e.get_component_entities():
                    c_namespace = self.get_type_property(c.get('type'), 'namespace')
                    if c_namespace == 'model':
                        logging.info('adding edge: %s -> %s', c.get('component').name, e.name)
                        self.dag.add_edge((c.get('component').name, e.name))

def render_entity_templates(self):
    # NOTE(review): this definition continues beyond the visible chunk;
    # only the visible portion is preserved here.
    targetroot = self.config.path.joinpath(self.config.paths.output)

    logging.info('render entity templates to: %s', targetroot)

    entity_count = len(self.entities.keys()) + len(self.interfaces.keys()) + len(self.subdags.keys())

    with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
    ) as progress:

        task1 = progress.add_task("[blue]Generating Entities: ", total=entity_count)
        for i, (k, entity) in enumerate(self.entities.items() | self.interfaces.items() | self.subdags.items()):
            progress.update(task1, advance=1)

            if entity.generate == 0:
                logging.info('skipping Entity %s (generate=0)', entity.name)
                continue

            if self.config.generator.get(entity.type.lower()):
                for templateconfig in self.config.generator.get(entity.type.lower()).get('templates'):
                    if entity.subtype in templateconfig.get('subtype', 'base'):
                        targetfolder = templateconfig['targetfolder']
                        outfile = templateconfig['filename']
templatefilename = templateconfig['template'] + synchtarget = templateconfig.get('synchtarget','') + + #Parse targetfolder and -filename templates: + templatefilename = self.templateEnvironment.from_string(templatefilename).render(entity=entity, model=self) + targetfolder = self.templateEnvironment.from_string(targetfolder).render(entity=entity, model=self) + outfile = self.templateEnvironment.from_string(outfile).render(entity=entity, model=self) + synchtarget = self.templateEnvironment.from_string(synchtarget).render(entity=entity, model=self) + + targetfolder = targetroot.joinpath(targetfolder) + makedirs(targetfolder, exist_ok=True) + + filename = targetfolder.joinpath(outfile) + logging.info('rendering Entity %s with template "%s" to %s ...', + entity.name, templatefilename, filename.relative_to(targetroot)) + + output = entity.render_template(templatefilename) + if output: # Leere Template-Results werden nicht als File geschrieben + + checksum = hashlib.md5(bytes(output,encoding ='utf-8')).hexdigest() + + if not self.cdc.get(filename): + self.cdc[filename] = dict(changed='new') + + self.cdc[filename]['current'] = checksum + + if checksum != self.cdc.get(filename,{}).get('previous'): # Only write file if content has changed + with open(filename, "w") as file: + file.write(output) + if self.cdc[filename]['changed'] != 'new': + self.cdc[filename]['changed'] = 'update' + else: + self.cdc[filename]['changed'] = 'same' + + if synchtarget: + synchtarget = self.config.path.joinpath(synchtarget) + self.cdc[filename]['synchto'] = synchtarget + + + else: + logging.info('skipping Entity %s with template "%s" because the result is empty', entity.name, + templatefilename) + + + def render_model_templates(self): + targetroot = self.config.path.joinpath(self.config.paths.output) + + logging.info('render model templates to: %s', targetroot) + + entity_count = len(self.config.generator['model']['templates']) + + with Progress( + SpinnerColumn(), + 
TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeElapsedColumn(), + ) as progress: + + task1 = progress.add_task("[blue]Generating Model: ", total=entity_count) + for templateconfig in self.config.generator['model']['templates']: + progress.update(task1, advance=1) + + targetfolder = templateconfig['targetfolder'] + outfile = templateconfig['filename'] + synchtarget = templateconfig.get('synchtarget','') + + #Parse targetfolder and -filename templates: + templateconfig['template'] = self.templateEnvironment.from_string(templateconfig['template']).render(model=self) + targetfolder = self.templateEnvironment.from_string(targetfolder).render(model=self) + outfile = self.templateEnvironment.from_string(outfile).render(model=self) + synchtarget = self.templateEnvironment.from_string(synchtarget).render(model=self) + + targetfolder = targetroot.joinpath(targetfolder) + + makedirs(targetfolder, exist_ok=True) + + filename = targetfolder.joinpath(outfile) + logging.info('rendering Model to %s ...', filename.relative_to(targetroot)) + + try: + template = self.templateEnvironment.get_template(templateconfig['template']) + + output = template.render( + model=self, + templatename=templateconfig['template'] + ) + if output: # Leere Template-Results werden nicht als File geschrieben + + checksum = hashlib.md5(bytes(output,encoding ='utf-8')).hexdigest() + + if not self.cdc.get(filename): + self.cdc[filename] = dict(changed='new') + + self.cdc[filename]['current'] = checksum + + if checksum != self.cdc.get(filename,{}).get('previous'): # Only write file if content has changed + with open(filename, "w") as file: + file.write(output) + if self.cdc[filename]['changed'] != 'new': + self.cdc[filename]['changed'] = 'update' + else: + self.cdc[filename]['changed'] = 'same' + + if synchtarget: + synchtarget = self.config.path.joinpath(synchtarget) + self.cdc[filename]['synchto'] = synchtarget + + + except TemplateNotFound: + print("") + 
print(Panel(f"[red]Error while rendering model-templates[/red]: Template {templateconfig['template']} not found.", title="[red]RENDER ERROR", padding=1,title_align="left" )) + #print(f"Config: Template {templateconfig['template']} not found.") + logging.error(f"Template {templateconfig['template']} not found.") + sys.exit(2) + except UndefinedError as e: + print("") + logging.error(f"Error while rendering model with Template {templateconfig['template']} :") + logging.error(e) + print(f"Error while rendering model with Template {templateconfig['template']} :", e) + sys.exit(2) + + def run_model_hooks(self, type): + # type = 'pre_hooks', 'post_hooks' + if self.config.get(type): + for hookname, hook in self.config.get(type,{}).items(): + print('---------------------------------------------------------------------------------') + print('hook:', hookname) + print('---------------------------------------------------------------------------------') + result = subprocess.run(hook, capture_output=True, text=True,shell=True) + print(result.stdout) + print(result.stderr) + + def capture_changes_before(self): + targetroot = self.config.path.joinpath(self.config.paths.output) + + self.cdc = {} + + for f in sorted(targetroot.glob('**/*.*')): + filename = f #.relative_to(targetroot) + with open(f, 'r') as inputfile: + checksum = hashlib.md5(inputfile.read().encode('UTF-8')).hexdigest() + self.cdc[filename] = {'current': '', + 'previous': checksum, + 'changed': 'deleted' # set changed to 'deleted' - when rendering state changes to 'new', 'same', 'update' or stays 'deleted' + } + + self.cdc_time= time.time() + + #print(self.cdc) + + def capture_changes_after(self): + + #delete file not in output anymore + for filename, info in self.cdc.items(): + if info.get('changed') == 'deleted': + filename.unlink() + + #check for renamed files (compare new and deleted files) + for filename, info in self.cdc.items(): + if info.get('changed') == 'deleted': + for f, i in self.cdc.items(): + if 
i.get('changed') == 'new' and info.get('previous') == i.get('current'): + self.cdc[filename]['changed'] = 'renamed' # change 'deleted' to 'renamed' + self.cdc[filename]['newname'] = f + self.cdc[f]['changed'] = 'renametarget' # change 'new' to 'renametarget' + self.cdc[f]['oldname'] = filename + + def display_changes(self): + table = Table(show_edge=False, box=box.MINIMAL) + table.add_column("State", justify="right", no_wrap=True) + table.add_column("File", style="white") + + update = [f for f, i in self.cdc.items() if i.get('changed') == 'update'] + new = [f for f, i in self.cdc.items() if i.get('changed') == 'new'] + deleted = [f for f, i in self.cdc.items() if i.get('changed') == 'deleted'] + renamed = {f:i for f, i in self.cdc.items() if i.get('changed') == 'renamed'} + + for f in update: + table.add_row("[yellow]updated",str(f.relative_to(os.getcwd()))) + + for f in new: + table.add_row("[green]new",str(f.relative_to(os.getcwd()))) + + for f in deleted: + table.add_row("[red]deleted",str(f.relative_to(os.getcwd()))) + + for f, i in renamed.items(): + table.add_row("[blue]renamed",str(f.relative_to(os.getcwd())) + " => "+ str(i.get('newname').relative_to(os.getcwd())) ) + + if table.row_count > 0: + print(table) + else: + print("[grey]No files changed.") + # for state in ('update','new','deleted','renamed'): + # print(state + ': ') + # for k, v in self.cdc.items(): + # if v.get('changed') == state: + # if state == 'renamed': + # print(" {} => {}".format(k.relative_to(os.getcwd()), v.get('newname').relative_to(os.getcwd()))) + # else: + # print(" {}".format(k.relative_to(os.getcwd()))) +# + + # print(' ') + + def synch(self, fullsynch = False): + logging.info('Synching to targets: ...' 
) + + if fullsynch: + items = [(k, v.get('synchto') ) for k, v in self.cdc.items() if v.get('synchto')] + else: + items = [(k, v.get('synchto') ) for k, v in self.cdc.items() if v.get('synchto') and v.get('changed') in ('update','new','renamed') ] + + #i = 0 + for item in items: + #i += 1 + #print_progressbar(i, len(items), prefix='Synching to Target: ', suffix='Complete') #TODO: auf rich.progress umstellen + logging.info("Copy {} to {}".format(item[0].relative_to(os.getcwd()), item[1] )) + makedirs( item[1], exist_ok=True) + copy2(item[0], item[1]) + + def get_parsed_query(self, entity, rawquery): + """ Parses Querystrings like: Select * from {entityname1} join {entityname2} + and returns a parsed query like Select * from [database].[dbo].[entityname1] + join [database].[dbo].[entityname2] """ + parsed_result = rawquery + + for placeholder, queryentity in self.get_query_entities(rawquery).items(): + if queryentity: + include_db = False if entity.dbentity.database == queryentity.dbentity.database else True + + parsed_result = parsed_result.replace('{' + str(placeholder) + '}', + queryentity.dbentity.get_qualifier(include_db)) + + return parsed_result + + def get_query_entities(self, rawquery): + """ Parses Querystrings like: Select * from {entityname1} join {entityname2} + and returns a list of entity instances. 
""" + regex = r"\{(.*?)?\}" + + entities = {} + + matches = re.finditer(regex, rawquery, re.MULTILINE) + for matchNum, match in enumerate(matches): + for groupNum in range(0, len(match.groups())): + entities[match.group(1)] = self.get_entity(match.group(1)) + + return entities + + def get_entity_name_suggestion(self, entity_type: list, name: str, maxdist: int = 5) -> String: + suggest = None + dist = maxdist + for e in self.entities.values(): + if e.type in entity_type: + t = self.get_levenshtein_distance(name, e.name) + if t < dist: + suggest = e.name + dist = t + if t == 1: + return suggest + #print(name, e, t) + return suggest + + + def get_levenshtein_distance(self, word1, word2) -> int: + word2 = word2.lower() + word1 = word1.lower() + matrix = [[0 for x in range(len(word2) + 1)] for x in range(len(word1) + 1)] + + for x in range(len(word1) + 1): + matrix[x][0] = x + for y in range(len(word2) + 1): + matrix[0][y] = y + + for x in range(1, len(word1) + 1): + for y in range(1, len(word2) + 1): + if word1[x - 1] == word2[y - 1]: + matrix[x][y] = min( + matrix[x - 1][y] + 1, + matrix[x - 1][y - 1], + matrix[x][y - 1] + 1 + ) + else: + matrix[x][y] = min( + matrix[x - 1][y] + 1, + matrix[x - 1][y - 1] + 1, + matrix[x][y - 1] + 1 + ) + + return matrix[len(word1)][len(word2)] + + def create_snapshot(self, filename): + logging.info('creating snaphot: %s', filename) + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeElapsedColumn(), + ) as progress: + + entity_count = len(self.entities.keys()) +len(self.interfaces.keys()) + len(self.subdags.keys()) + task1 = progress.add_task("[blue]Generating Snapshot: ", total=entity_count) + + with open(filename, 'w') as file: + docs = list() + for i, (k,entity) in enumerate(self.entities.items() | self.interfaces.items() | self.subdags.items() ): + docs.append(entity._definition) + progress.update(task1, advance=1) + + yaml.dump_all( + docs, + 
file, + default_flow_style=False, + explicit_start=False, + sort_keys=False + ) + + def load_snapshot(self, filename): + documents = dict() + with open(filename, 'r') as file: + for obj in list( yaml.load_all(file, Loader=yaml.FullLoader) ): + documents[obj.get('name')] = obj + print(documents) + + diff --git a/DataVaultGenerator/__init__.py b/DataVaultGenerator/__init__.py new file mode 100644 index 0000000..99d2a6f --- /dev/null +++ b/DataVaultGenerator/__init__.py @@ -0,0 +1 @@ +__version__ = '1.1.5' diff --git a/DataVaultGenerator/__main__.py b/DataVaultGenerator/__main__.py new file mode 100644 index 0000000..fb8bd00 --- /dev/null +++ b/DataVaultGenerator/__main__.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python + +import sys +import argparse +import logging +from DataVaultGenerator.Components import ErrorCollection +from DataVaultGenerator.Model import Model +from DataVaultGenerator import __version__ + +from rich import print +from rich.table import Table +from rich.panel import Panel +from rich import box + +MIN_PYTHON = (3, 9) + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + if sys.version_info < MIN_PYTHON: + sys.exit("Python %s.%s or later is required.\n" % MIN_PYTHON) + + parser = argparse.ArgumentParser(description='DataVaultGenerator') + parser.add_argument('config', help='Path of Config file') + parser.add_argument('-l','--loglevel', default='INFO', help='Loglevel: CRITICAL, ERROR, WARNING, INFO, DEBUG (default: %(default)s)') + parser.add_argument('-lf','--logfile', default='generator.log', help='Logfilename (default: %(default)s)') + parser.add_argument('-v', '--validate', dest='validateonly', help='Switch to run validation only', action='store_true') + parser.add_argument('--validateschema', dest='validateschemaonly', help='Switch to run validation of schema only',action='store_true') + parser.add_argument('--novalidate', help='Switch to skip validation',action='store_true') + parser.add_argument('--synch', help='Synchronize changed 
files to the target',action='store_true') + parser.add_argument('--fullsynch', help='Synchronize all files to the target',action='store_true') + parser.add_argument('--runhooks', help='Run pre- and post-hooks',action='store_true') + parser.add_argument('--snapshot', help='Create Snapshotfile',action='store_true') + + args = parser.parse_args() + + title = """\ + _____ _ __ __ _ _ _____ _ +| __ \ | | \ \ / / | | | / ____| | | +| | | | __ _| |_ __ \ \ / /_ _ _ _| | |_ | | __ ___ _ __ ___ _ __ __ _| |_ ___ _ __ +| | | |/ _` | __/ _` \ \/ / _` | | | | | __| | | |_ |/ _ \ '_ \ / _ \ '__/ _` | __/ _ \| '__| +| |__| | (_| | || (_| |\ / (_| | |_| | | |_ | |__| | __/ | | | __/ | | (_| | || (_) | | +|_____/ \__,_|\__\__,_| \/ \__,_|\__,_|_|\__| \_____|\___|_| |_|\___|_| \__,_|\__\___/|_| + """ + + print(Panel(title, expand=False, box=box.HORIZONTALS)) + + table = Table(show_header =False, show_edge=False ) + table.add_column("Prop", justify="right", style="white", no_wrap=True) + table.add_column("Value", style="white") + + table.add_row("Version", __version__) + table.add_row("Config", args.config) + + print(table) + + print('') + + numeric_level = getattr(logging, args.loglevel.upper(), None) + + logging.basicConfig(filename=args.logfile, + filemode='w', + level=numeric_level, + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%Y-%m-%d %I:%M:%S') + + logging.info(title) + + dm = Model() + + #FIXME: boilerplate nutzen, um objekt per cli zu erstellen: dvgen add hub my_hub /path/tofile + #with open('hub_boiler.yaml', 'w') as the_file: + # the_file.write(dm.get_boilerplate('hub')) + + + dm.load_config(args.config) + + if args.runhooks: dm.run_model_hooks('pre_hooks') + + if args.validateschemaonly: + errorcount = dm.validate_entities_schemas() + print(errorcount, "Errors found during validation of entity-schemas") + + exit(0) + + dm.load_entities() + dm.load_mappings() + + if not args.novalidate: + overallerrorcount = 0 + errors = dm.validate_entities() + if 
errors.count != 0: + overallerrorcount += errors.count + print(errors.count, "Errors found during validation of Entities") + + if overallerrorcount != 0: + exit(2) + + errors = dm.validate_mappings() + if errors.count != 0: + overallerrorcount += errors.count + print(errors.count, "Errors found during validation of Mappings") + + if overallerrorcount != 0: + exit(2) + + if not args.validateonly: + dm.build_dag() + dm.capture_changes_before() + dm.render_entity_templates() + dm.render_model_templates() + + if args.snapshot: dm.create_snapshot('snapshot.yaml') + + dm.capture_changes_after() #FIXME: Wenn ein modeltemplate nicht verfügbar ist, erscheint es im Log aber nicht auf der Konsole und die nachfolgenden Zeilen werden nicht ausgeführt + + if args.synch or args.fullsynch: dm.synch(args.fullsynch) + + print('') + dm.display_changes() + print('') + + if args.runhooks: dm.run_model_hooks('post_hooks') + + + +if __name__ == "__main__": + main() diff --git a/DataVaultGenerator/schema/.DS_Store b/DataVaultGenerator/schema/.DS_Store new file mode 100644 index 0000000..11505ab Binary files /dev/null and b/DataVaultGenerator/schema/.DS_Store differ diff --git a/DataVaultGenerator/schema/config.yaml b/DataVaultGenerator/schema/config.yaml new file mode 100644 index 0000000..f83db33 --- /dev/null +++ b/DataVaultGenerator/schema/config.yaml @@ -0,0 +1,286 @@ +type: config +name: Configuration +displayname: Configuration +namespace: model +schema: +# ------------------------------------------------------------------------------------------------ +# Model +# ------------------------------------------------------------------------------------------------ + + model: + type: dict + schema: + name: + type: string + paths: + type: dict + schema: + log: + type: string + required: True + entities: + type: string + required: True + mappings: + type: string + required: True + templates: + type: string + required: True + output: + type: string + required: True + ignore_file_prefix: + 
type: string + +# ------------------------------------------------------------------------------------------------ +# Variables +# ------------------------------------------------------------------------------------------------ + vars: + type: dict + +# ------------------------------------------------------------------------------------------------ +# Hooks +# ------------------------------------------------------------------------------------------------ + pre_hooks: + type: dict + valuesrules: + type: list + + post_hooks: + type: dict + valuesrules: + type: list +# ------------------------------------------------------------------------------------------------ +# Key Definition +# ------------------------------------------------------------------------------------------------ + + keyattribute: + type: dict + required: True + schema: + type: + type: string + required: True + role: + type: string + required: True + mandatory: + type: boolean + ghost: + type: string + + zerokey: + type: string + +# ------------------------------------------------------------------------------------------------ +# Modelling Constraints +# ------------------------------------------------------------------------------------------------ + + constraints: + type: dict + schema: + enforce_bk_type: + type: [string, list] + +# ------------------------------------------------------------------------------------------------ +# HASH Definition +# ------------------------------------------------------------------------------------------------ + + hash_algorithm: + type: string + required: true + hash_separator: + type: string + required: true + hash_case: + type: string + required: True + allowed: ['upper', 'lower', 'keep'] + +# ------------------------------------------------------------------------------------------------ +# Business Key Treatment +# ------------------------------------------------------------------------------------------------ + + business_key_treatment: + type: dict + 
required: True + schema: + trim: + type: string + required: True + allowed: ['left', 'right', 'both'] + case: + type: string + required: True + allowed: ['upper', 'lower', 'keep'] + +# ------------------------------------------------------------------------------------------------ +# Hashdiff Attribute Treatment +# ------------------------------------------------------------------------------------------------ + + hashdiff_attribute_treatment: + type: dict + required: True + schema: + trim: + type: string + required: True + allowed: ['left', 'right', 'both'] + case: + type: string + required: True + allowed: ['upper', 'lower', 'keep'] + +# ------------------------------------------------------------------------------------------------ +# Common Attributes +# ------------------------------------------------------------------------------------------------ + + commonattributes: + type: dict + valuesrules: + type: dict + schema: + name: + type: string + required: True + type: + type: string + required: True + mandatory: + type: boolean + ghost: + type: string + +# ------------------------------------------------------------------------------------------------ +# Ghost-records +# ------------------------------------------------------------------------------------------------ + + ghostrecord: + type: dict + valuesrules: + type: string + +# ------------------------------------------------------------------------------------------------ +# Layer +# ------------------------------------------------------------------------------------------------ + + layer: + type: dict + valuesrules: + type: dict + schema: + name: + type: string + required: True + description: + type: string + connectionname: + type: string + sys_specification: + type: string + defaultdatabaseobject: + type: dict + schema: + database: + type: string + schema: + type: string + filegroup: + type: string + properties: + type: dict + +# 
------------------------------------------------------------------------------------------------ +# Entity Defaults +# ------------------------------------------------------------------------------------------------ + + entitydefaults: + type: dict + valuesrules: + type: dict + schema: + layer: + type: string + required: True + attributes: + type: list + schema: + type: string + attribute_role: + type: string + map_implicit_roles: + type: list + schema: + type: string + extra: + type: dict + +# ------------------------------------------------------------------------------------------------ +# Generator config +# ------------------------------------------------------------------------------------------------ + + generator: + type: dict + valuesrules: + type: dict + schema: + templates: + type: list + schema: + type: dict + schema: + subtype: + type: [string,list] + template: + type: string + required: True + targetfolder: + type: string + required: True + filename: + type: string + required: True + lang: + type: string + required: True + synchtarget: + type: string + +# ------------------------------------------------------------------------------------------------ +# Base templates +# ------------------------------------------------------------------------------------------------ + + templates: + type: dict + schema: + column_ddl: + type: string + required: True + table_qualifier: + type: string + required: True + attribute_expression: + type: string + required: True + entity_key_name: + type: string + required: True + query_entity_alias: + type: string + required: True + +# ------------------------------------------------------------------------------------------------ +# Template Engine +# ------------------------------------------------------------------------------------------------ + + jinja: + type: dict + +# ------------------------------------------------------------------------------------------------ +# sys_specifications +# 
------------------------------------------------------------------------------------------------ + + sys_specification: + type: dict + \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/bridge.yaml b/DataVaultGenerator/schema/entities/bridge.yaml new file mode 100644 index 0000000..88d5e13 --- /dev/null +++ b/DataVaultGenerator/schema/entities/bridge.yaml @@ -0,0 +1,52 @@ +type: bridge +name: bridge +displayname: Bridge Table +namespace: model +schema: +# GeneratorEntity + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + allowed: [base] + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Specific + updatemode: + type: string + allowed: [full, merge, append, custom] + snapshotattribute: + type: dict + schema: attribute + snapshotquery: + type: string + bridgeattributes: attributes + hubs: + type: list + schema: + type: string + links: + type: list + schema: + type: string + \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/composite.yaml b/DataVaultGenerator/schema/entities/composite.yaml new file mode 100644 index 0000000..7455906 --- /dev/null +++ b/DataVaultGenerator/schema/entities/composite.yaml @@ -0,0 +1,49 @@ +type: composite +name: composite +displayname: Composite +namespace: model +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + type: dict + schema: + name: + type: string + 
filegroup: + type: string + database: + type: string + schema: + type: string + properties: + type: dict + exclude_commonattributes: + type: list +# Attributes: + attributes: attributes +# Specific: + query: + type: string + required: True \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/delivery.yaml b/DataVaultGenerator/schema/entities/delivery.yaml new file mode 100644 index 0000000..f64695d --- /dev/null +++ b/DataVaultGenerator/schema/entities/delivery.yaml @@ -0,0 +1,60 @@ +type: delivery +name: delivery +displayname: Delivery +namespace: model +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Specific: + recordsource: + type: string + batchmode: + type: string + deltaattribute: + type: string + deltainitialvalue: + type: string + query: + type: string + interfaces: + type: list + required: True + schema: + type: string + sourcesystem: + type: string + sourcetype: + type: string + ldts_source: + type: string + properties: + type: dict + +# Attributes: + attributes: attributes + diff --git a/DataVaultGenerator/schema/entities/generictable.yaml b/DataVaultGenerator/schema/entities/generictable.yaml new file mode 100644 index 0000000..63a57e3 --- /dev/null +++ b/DataVaultGenerator/schema/entities/generictable.yaml @@ -0,0 +1,37 @@ +type: generictable +name: generictable +displayname: Generic Table +namespace: model +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: 
+ type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string + +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Specific: + +# Attributes: + attributes: attributes \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/generictask.yaml b/DataVaultGenerator/schema/entities/generictask.yaml new file mode 100644 index 0000000..b3a8105 --- /dev/null +++ b/DataVaultGenerator/schema/entities/generictask.yaml @@ -0,0 +1,39 @@ +type: generictask +name: generictask +displayname: Generic Task +namespace: model +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + required: True + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + +# Specific: + sources: + type: list + required: True + targets: + type: list + required: True \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/generictransformation.yaml b/DataVaultGenerator/schema/entities/generictransformation.yaml new file mode 100644 index 0000000..75e28aa --- /dev/null +++ b/DataVaultGenerator/schema/entities/generictransformation.yaml @@ -0,0 +1,41 @@ +type: generictransformation +name: generictransformation +displayname: Generic Transformation +namespace: model +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + +# Specific: + query: + type: string + required: True + 
sources: + type: list + required: True + targets: + type: list + required: True \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/hub.yaml b/DataVaultGenerator/schema/entities/hub.yaml new file mode 100644 index 0000000..067cb17 --- /dev/null +++ b/DataVaultGenerator/schema/entities/hub.yaml @@ -0,0 +1,59 @@ +type: hub +name: Hub +displayname: Hub +namespace: model +schema: +# GeneratorEntity + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + +# Specific + key: + type: string + key_treatment: + type: dict + schema: + trim: + type: string + allowed: ['left', 'right', 'both'] + case: + type: string + allowed: ['upper', 'lower', 'keep'] + roleof: + type: string + caseSesitive: + type: integer + allowed: [0,1] + +# Attributes: + attributes: attributes + +boilerplate: | + name: {unique_name} + type: hub + key: primary_key_name # Hashkey + description: 'optional description' + attributes: + - {name: 'businesskey1', type: 'nvarchar(200)'} \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/link.yaml b/DataVaultGenerator/schema/entities/link.yaml new file mode 100644 index 0000000..1541bd2 --- /dev/null +++ b/DataVaultGenerator/schema/entities/link.yaml @@ -0,0 +1,52 @@ +type: link +name: Link +displayname: Link +namespace: model +schema: +# GeneratorEntity + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: 
dbentity + exclude_commonattributes: + type: list +# Specific + key: + type: string + hubs: + type: list + required: True + schema: + type: string + links: + type: list + schema: + type: string + drivingkeys: + type: list + schema: + type: string + +# Attributes: + attributes: attributes + \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/pit.yaml b/DataVaultGenerator/schema/entities/pit.yaml new file mode 100644 index 0000000..6fb97b2 --- /dev/null +++ b/DataVaultGenerator/schema/entities/pit.yaml @@ -0,0 +1,61 @@ +type: pit +name: pit +displayname: Point in Time Table +namespace: model +schema: +# GeneratorEntity + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + allowed: [base] + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Specific + snapshotmode: + type: string + required: True + allowed: [latest, snapshotquery, full, snapshottable] + baseentity: + type: string + required: True + satellites: + type: list + required: True + schema: + type: string + snapshotattribute: + type: dict + schema: attribute + snapshottable: + type: string + snapshottableattribute: + type: string + snapshotquery: + type: string + pitattributes: + type: list + schema: + type: list + query: + type: string \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/reference.yaml b/DataVaultGenerator/schema/entities/reference.yaml new file mode 100644 index 0000000..331c9f0 --- /dev/null +++ b/DataVaultGenerator/schema/entities/reference.yaml @@ -0,0 +1,41 @@ +type: reference +name: reference +displayname: Reference Table +namespace: model +schema: +# GeneratorEntity + name: + type: string + required: True + type: + type: string + required: 
True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Specific + data: + type: list + schema: + type: list + query: + type: string +# Attributes: + attributes: attributes \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/report.yaml b/DataVaultGenerator/schema/entities/report.yaml new file mode 100644 index 0000000..5b15dd5 --- /dev/null +++ b/DataVaultGenerator/schema/entities/report.yaml @@ -0,0 +1,6 @@ +type: report +name: report +displayname: Report +namespace: reporting +schema: + \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/satellite.yaml b/DataVaultGenerator/schema/entities/satellite.yaml new file mode 100644 index 0000000..839b3b6 --- /dev/null +++ b/DataVaultGenerator/schema/entities/satellite.yaml @@ -0,0 +1,48 @@ +type: satellite +name: Satellite +displayname: Satellite +namespace: model +schema: +# GeneratorEntity + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + allowed: [base,drivingkeystatus] + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Specific + parent: + type: string + required: True + hashdiff_attribute_treatment: + type: dict + schema: + trim: + type: string + allowed: ['left', 'right', 'both'] + case: + type: string + allowed: ['upper', 'lower', 'keep'] +# Attributes: + attributes: attributes \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/source.yaml 
b/DataVaultGenerator/schema/entities/source.yaml new file mode 100644 index 0000000..e6ce240 --- /dev/null +++ b/DataVaultGenerator/schema/entities/source.yaml @@ -0,0 +1,26 @@ +type: source +name: source +displayname: Source interface +namespace: source +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + description: + type: string +# Specific: + sourcetype: + type: string + sourcesystem: + type: string + dbentity: + schema: dbentity + properties: + type: dict +# Attributes: + attributes: attributes + diff --git a/DataVaultGenerator/schema/entities/sourcesystem.yaml b/DataVaultGenerator/schema/entities/sourcesystem.yaml new file mode 100644 index 0000000..885c7ba --- /dev/null +++ b/DataVaultGenerator/schema/entities/sourcesystem.yaml @@ -0,0 +1,23 @@ +type: sourcesystem +name: sourcesystem +displayname: Source System +namespace: sourcesystem +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + description: + type: string +# Specific: + shortname: + type: string + connectionname: + type: string + sourcesystemtype: + type: string + sys_specification: + type: string \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/subdag.yaml b/DataVaultGenerator/schema/entities/subdag.yaml new file mode 100644 index 0000000..f923eaf --- /dev/null +++ b/DataVaultGenerator/schema/entities/subdag.yaml @@ -0,0 +1,29 @@ +type: subdag +name: subdag +displayname: Sub Dag +namespace: dag +schema: +# Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string +# Specific: + entrypoints: + type: list + excludes: + type: list + key: + type: string + \ No newline at end of file diff --git a/DataVaultGenerator/schema/entities/view.yaml 
b/DataVaultGenerator/schema/entities/view.yaml new file mode 100644 index 0000000..7dd0d99 --- /dev/null +++ b/DataVaultGenerator/schema/entities/view.yaml @@ -0,0 +1,59 @@ +type: view +name: view +displayname: View +namespace: model +schema: + # Generator Entity: + name: + type: string + required: True + type: + type: string + required: True + subtype: + type: string + generate: + type: integer + allowed: [0,1] + extra: + type: [list, dict, string, integer] + description: + type: string + sql_pre_hook: + type: string + sql_post_hook: + type: string +# DataVaultEntity + layer: + type: string + dbentity: + schema: dbentity + exclude_commonattributes: + type: list +# Attributes: + attributes: attributes +# Specific: + query: + type: string + required: True + materialize: + type: integer + allowed: [0,1] + materialization: + type: dict + schema: + mode: + type: string + allowed: ['merge', 'full'] + target: + type: string + layer: + type: string + mergekeys: + type: list + schema: + type: string + query: + type: string + + diff --git a/DataVaultGenerator/schema/mapping.yaml b/DataVaultGenerator/schema/mapping.yaml new file mode 100644 index 0000000..025f663 --- /dev/null +++ b/DataVaultGenerator/schema/mapping.yaml @@ -0,0 +1,34 @@ +type: mapping +name: mapping +displayname: Mapping +schema: + root: # Workaround, da cerberus dynamische roots nicht unterstützt + type: dict + valuesrules: + type: list + schema: + type: dict + schema: + target: + type: string + required: True + mappingmode: + type: string + allowed: ['explicit', 'implicit'] + type: + type: string + allowed: ['mappingonly', 'master'] + + mapping: + type: list + schema: + type: list + schema: + type: [string, dict] + schema: + expression: + type: string + required: True + resulttype: + type: string + diff --git a/DataVaultGenerator/schema/model.yaml b/DataVaultGenerator/schema/model.yaml new file mode 100644 index 0000000..a72f571 --- /dev/null +++ b/DataVaultGenerator/schema/model.yaml @@ -0,0 +1,5 @@ 
+type: model +name: model +displayname: Model +schema: + \ No newline at end of file diff --git a/DataVaultGenerator/schema/registry/attribute.yaml b/DataVaultGenerator/schema/registry/attribute.yaml new file mode 100644 index 0000000..3172fd6 --- /dev/null +++ b/DataVaultGenerator/schema/registry/attribute.yaml @@ -0,0 +1,42 @@ +name: attribute +type: schema +schema: + name: + type: string + required: True + type: + type: string + required: True + mandatory: + type: boolean + default: + type: string + description: + type: string + ghost: + type: string + role: + type: string + precision: + type: [integer, string] + scale: + type: [integer, string] + length: + type: [integer, string] + order: + type: integer + pii: + type: boolean + props: + type: dict + + # View related + reference: + type: string + referencetype: + type: string + + components: + type: list + schema: + type: string diff --git a/DataVaultGenerator/schema/registry/attributes.yaml b/DataVaultGenerator/schema/registry/attributes.yaml new file mode 100644 index 0000000..1a8ac02 --- /dev/null +++ b/DataVaultGenerator/schema/registry/attributes.yaml @@ -0,0 +1,9 @@ +name: attributes +type: ruleset +schema: + type: list + schema: + type: dict + schema: attribute + + \ No newline at end of file diff --git a/DataVaultGenerator/schema/registry/dbentity.yaml b/DataVaultGenerator/schema/registry/dbentity.yaml new file mode 100644 index 0000000..35319f5 --- /dev/null +++ b/DataVaultGenerator/schema/registry/dbentity.yaml @@ -0,0 +1,13 @@ +name: dbentity +type: schema +schema: + name: + type: string + filegroup: + type: string + database: + type: string + schema: + type: string + properties: + type: dict \ No newline at end of file diff --git a/DataVaultGenerator/schema/sys_specification.yaml b/DataVaultGenerator/schema/sys_specification.yaml new file mode 100644 index 0000000..84676bb --- /dev/null +++ b/DataVaultGenerator/schema/sys_specification.yaml @@ -0,0 +1,19 @@ +type: sys_specification +name: 
sys_specification +displayname: System Specification +namespace: lang +schema: + sys_specification: + type: string + required: true + objectnames: + type: string + datatypes: + type: dict + valuesrules: + type: dict + schema: + pattern: + type: string + required: True + \ No newline at end of file diff --git a/PKG-INFO b/PKG-INFO new file mode 100644 index 0000000..137e6a2 --- /dev/null +++ b/PKG-INFO @@ -0,0 +1,15 @@ +Metadata-Version: 2.1 +Name: DataVaultGenerator +Version: 1.1.5 +Summary: BI Data Vault Generator package +Home-page: https://github.com/... +Author: Christoph Metz +Author-email: metz@bi-web.de +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: BI License +Classifier: Operating System :: OS Independent +Requires-Python: >=3.7 +Description-Content-Type: text/markdown diff --git a/README.md b/README.md new file mode 100644 index 0000000..231ff5b --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# Installationsanleitung + +## Bereitstellung Files + +```sh +install/DataVaultGenerator-1.0.0.tar.gz +``` + +## Virtual Environment erstellen und aktivieren + +Erstellen: +```sh +python3 -m venv venv +``` + +Aktivieren: +(Unix) +```sh +source venv/bin/activate +``` + +(Windows) +```sh +venv\Scripts\activate.bat +``` + +PIP aktualisieren: + +```sh +pip install --upgrade pip +``` + + +## Installation + +### Core +Generator: +```sh +pip install install/DataVaultGenerator-1.0.0.tar.gz +``` + +Abhängigkeiten (nicht mehr nötig): + +```sh +pip install -r requirements.txt +``` + +## Deinstallation + +Generator: +```sh +pip uninstall DataVaultGenerator +``` + +pip freeze > to-uninstall.txt +pip uninstall -r to-uninstall.txt diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8bfd5a1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[egg_info] +tag_build = +tag_date = 0 + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..0648d60 --- /dev/null 
+++ b/setup.py @@ -0,0 +1,39 @@ +import setuptools + +from DataVaultGenerator import __version__ + +setuptools.setup( + name="DataVaultGenerator", # Replace with your own username + dist_dir = "build/dist", + version=__version__, + author="Christoph Metz", + author_email="metz@bi-web.de", + description="BI Data Vault Generator package", + long_description="", + long_description_content_type="text/markdown", + url="https://github.com/...", + packages=['DataVaultGenerator', 'DataVaultGenerator.Entities'], #setuptools.find_packages(), + #include_package_data=True, + package_data={ + "DataVaultGenerator": ["schema/*.yaml", "schema/registry/*.yaml", "schema/entities/*.yaml"], + }, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BI License", + "Operating System :: OS Independent", + ], + python_requires='>=3.7', + install_requires=[ + 'pyyaml', + 'jinja2', + 'cerberus', + 'rich' + #'flask' + ], + entry_points={ + "console_scripts": [ + "dvgen = DataVaultGenerator.__main__:main" + ] + }, + #scripts=['dvgen.py'], +)