# Source code for external_data.agent

# -*- coding: utf-8 -*- {{{
# vim: set fenc=utf-8 ft=python sw=4 ts=4 sts=4 et:
# Copyright 2020, Battelle Memorial Institute.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# This material was prepared as an account of work sponsored by an agency of
# the United States Government. Neither the United States Government nor the
# United States Department of Energy, nor Battelle, nor any of their
# employees, nor any jurisdiction or organization that has cooperated in the
# development of these materials, makes any warranty, express or
# implied, or assumes any legal liability or responsibility for the accuracy,
# completeness, or usefulness or any information, apparatus, product,
# software, or process disclosed, or represents that its use would not infringe
# privately owned rights. Reference herein to any specific commercial product,
# process, or service by trade name, trademark, manufacturer, or otherwise
# does not necessarily constitute or imply its endorsement, recommendation, or
# favoring by the United States Government or any agency thereof, or
# Battelle Memorial Institute. The views and opinions of authors expressed
# herein do not necessarily state or reflect those of the
# United States Government or any agency thereof.
# under Contract DE-AC05-76RL01830
# }}}

import csv
import logging
import sys
from ast import literal_eval
from io import StringIO

import requests
from requests.auth import HTTPBasicAuth

from volttron.platform.agent import utils
from volttron.platform.messaging import headers as headers_mod
from volttron.platform.messaging.utils import Topic
from volttron.platform.scheduling import periodic
from volttron.platform.vip.agent import Agent

__author__ = 'Kyle Monson'
__copyright__ = 'Copyright (c) 2017, Battelle Memorial Institute'
__license__ = 'Apache 2.0'

# Module-level logger; configured by the VOLTTRON platform at startup.
_log = logging.getLogger(__name__)
__version__ = '1.0'

# Topic template for all publishes: <global_topic_prefix>/<source_topic>/<key>.
# The Topic helper fills in the placeholders; doubled slashes allow any
# segment (base, source, or key) to be empty without leaving "//" in the
# final topic string.
PUBLISH_TOPIC = Topic("{base}//{source}//{key}")

def external_data_agent(config_path, **kwargs):
    """Build an :class:`ExternalData` agent from a configuration file.

    :param config_path: Path to a JSON config file loaded via
        ``utils.load_config``.
    :param kwargs: Passed through to the agent constructor.
    :returns: A configured :class:`ExternalData` instance.
    """
    config = utils.load_config(config_path)
    # Sensible fallbacks: scrape every 5 minutes, no auth, no topic prefix.
    interval = config.get('interval', 300)
    default_user = config.get('default_user')
    default_password = config.get('default_password')
    global_topic_prefix = config.get('global_topic_prefix', "")
    sources = config.get("sources", [])
    return ExternalData(interval, default_user, default_password,
                        sources, global_topic_prefix, **kwargs)
class ExternalData(Agent):
    """Gathers and publishes JSON, CSV, or raw data available via a web api.

    Each configured source is fetched with ``requests`` on a periodic
    schedule and its payload is republished on the VOLTTRON message bus
    under ``<global_topic_prefix>/<source_topic>/<key>``.
    """

    def __init__(self, interval, default_user, default_password, sources,
                 global_topic_prefix, **kwargs):
        """
        :param interval: Seconds between scrapes of all sources.
        :param default_user: Fallback HTTP basic-auth user name.
        :param default_password: Fallback HTTP basic-auth password.
        :param sources: List of per-source configuration dicts.
        :param global_topic_prefix: Prefix prepended to every publish topic.
        """
        super(ExternalData, self).__init__(**kwargs)
        self.interval = interval
        self.default_user = default_user
        self.default_password = default_password
        # Handle to the scheduled periodic scrape; replaced on reconfigure.
        self.periodic = None

        self.default_config = {"interval": interval,
                               "global_topic_prefix": global_topic_prefix,
                               "default_user": default_user,
                               "default_password": default_password}

        # Drop invalid sources up front so the default config is clean.
        self.default_config["sources"] = self.sources = \
            self._validate_sources(sources, global_topic_prefix)

        self.vip.config.set_default("config", self.default_config)
        self.vip.config.subscribe(self._configure,
                                  actions=["NEW", "UPDATE"],
                                  pattern="config")

    def _validate_sources(self, old_sources, global_topic_prefix):
        """Return only sources that have a url and a usable publish topic.

        A source with no ``key`` must supply a non-empty topic (otherwise
        it would publish to an empty topic string and is removed).
        """
        topic = PUBLISH_TOPIC(base=global_topic_prefix, source=None, key=None)
        new_sources = []
        for source in old_sources:
            url = source.get("url")
            key_column = source.get("key")
            source_topic = source.get("topic", "")
            if url is None:
                _log.error("Missing url from source!")
                continue
            if key_column is None:
                # No per-row key: the static topic itself must be non-empty.
                topic_str = topic(source=source_topic, key="")
                if not topic_str:
                    _log.error("{url} configured to publish without topic. Removing source".format(url=url))
                    continue
            new_sources.append(source)
        return new_sources

    def _configure(self, config_name, action, contents):
        """Config-store callback: (re)apply settings and reschedule scraping."""
        config = self.default_config.copy()
        config.update(contents)

        _log.debug("Configuring External Data agent")

        global_topic_prefix = config.get('global_topic_prefix', "")
        self.sources = self._validate_sources(config.get("sources", []),
                                              global_topic_prefix)
        self.topic = PUBLISH_TOPIC(base=global_topic_prefix,
                                   source=None,
                                   key=None)

        self.default_user = config.get('default_user')
        self.default_password = config.get('default_password')

        try:
            interval = float(config.get("interval", 300.0))
        except ValueError:
            _log.error("Error setting scrape interval, reverting to default of 300 seconds")
            interval = 300.0

        # Cancel any previous schedule before installing the new one so
        # reconfiguration never leaves two scrape loops running.
        if self.periodic is not None:
            self.periodic.cancel()
        self.periodic = self.core.schedule(periodic(interval),
                                           self._publish_data)

    def _publish_data(self):
        """Scrape every configured source once and publish the results."""
        for source in self.sources:
            now = utils.get_aware_utc_now()
            now = utils.format_timestamp(now)
            headers = {
                headers_mod.DATE: now,
                headers_mod.TIMESTAMP: now
            }
            url = source.get("url")
            params = source.get("params")
            source_topic = source.get("topic", "")
            user = source.get("user", self.default_user)
            password = source.get("password", self.default_password)
            source_type = source.get("type", "raw")

            kwargs = {"params": params}
            if user is not None:
                kwargs["auth"] = HTTPBasicAuth(user, password)

            try:
                r = requests.get(url, **kwargs)
                r.raise_for_status()
            except Exception as e:
                # A failing source must not stop the rest of the scrape.
                _log.error("Failure to read from source {url} {reason}".format(url=url, reason=str(e)))
                continue

            try:
                if source_type.lower() == "json":
                    self._handle_json(headers, r, url, source_topic, source)
                elif source_type.lower() == "csv":
                    self._handle_csv(headers, r, url, source_topic, source)
                elif source_type.lower() == "raw":
                    self._handle_raw(headers, r, url, source_topic, source)
            except Exception as e:
                _log.error("General failure during processing of source {url} {reason}".format(url=url, reason=str(e)))

    def _handle_json(self, headers, request, url, source_topic, source_params):
        """Publish a JSON response, optionally fanning a list out per key.

        ``path`` walks into the decoded document; if the result is a list
        and ``key`` is set, each row is published on its own key topic.
        """
        keys = source_params.get("key", [])
        path = source_params.get("path", [])

        # Accept a single string as shorthand for a one-element list.
        if isinstance(path, str):
            path = [path]
        if isinstance(keys, str):
            keys = [keys]

        try:
            data = request.json()
        except ValueError as e:
            _log.error("Failure to read from source {url} {reason}".format(url=url, reason=str(e)))
            return

        try:
            for path_name in path:
                data = data[path_name]
        except (KeyError, IndexError, TypeError) as e:
            _log.error("Failure to read from source {url} {reason}".format(url=url, reason=str(e)))
            return

        if isinstance(data, list) and keys:
            dropped_rows = False
            for row in data:
                missing_key = False
                key_value = row
                try:
                    for key_name in keys:
                        key_value = key_value[key_name]
                except (KeyError, IndexError, TypeError):
                    missing_key = dropped_rows = True

                if missing_key:
                    continue

                # Keys must be non-empty strings to form a valid topic.
                if not isinstance(key_value, str) or not key_value:
                    dropped_rows = True
                    continue

                topic = self.topic(source=source_topic, key=key_value)
                if not topic:
                    dropped_rows = True
                    continue

                self.vip.pubsub.publish('pubsub',
                                        topic=topic,
                                        message=row,
                                        headers=headers).get(timeout=10.0)
            if dropped_rows:
                _log.error("At least one key missing from the data from source {url}".format(url=url))
        else:
            # The topic has already been verified as legit by now.
            topic = self.topic(source=source_topic, key="")
            self.vip.pubsub.publish('pubsub',
                                    topic=topic,
                                    message=data,
                                    headers=headers).get(timeout=10.0)

    def _handle_csv(self, headers, request, url, source_topic, source_params):
        """Publish a CSV response, optionally flattened or fanned out per key."""
        key_column = source_params.get("key", "")
        flatten = source_params.get("flatten", False)
        parse_columns = source_params.get("parse", [])

        # BUGFIX: request.content is bytes; io.StringIO requires str.
        # Use request.text so requests decodes the body for us.
        file_obj = StringIO(request.text)

        if flatten:
            # Two-column CSV becomes a single {col0: col1} dict publish.
            to_flatten = csv.reader(file_obj)
            row_data = {}
            csv_rows_skipped = False
            for row in to_flatten:
                try:
                    row_data[row[0]] = row[1]
                except IndexError:
                    csv_rows_skipped = True
            if csv_rows_skipped:
                _log.warning("Skipped incomplete flatten rows in {url}".format(url=url))
            csv_data = [row_data]
        else:
            csv_data = csv.DictReader(file_obj)

        if parse_columns:
            new_csv_data = []
            for row in csv_data:
                for parse_column in parse_columns:
                    value_string = ""
                    try:
                        value_string = row[parse_column]
                        value = literal_eval(value_string)
                        row[parse_column] = value
                    except KeyError:
                        pass
                    except Exception:
                        # Unparseable non-empty strings are left as-is;
                        # empty strings become None.
                        if value_string == "":
                            row[parse_column] = None
                new_csv_data.append(row)
            csv_data = new_csv_data

        if key_column:
            dropped_rows = False
            for row in csv_data:
                # pop() so the key value is not duplicated in the payload.
                key = row.pop(key_column, "")
                topic = self.topic(source=source_topic, key=key)
                if topic:
                    self.vip.pubsub.publish('pubsub',
                                            topic=topic,
                                            message=row,
                                            headers=headers).get(timeout=10.0)
                else:
                    dropped_rows = True
            if dropped_rows:
                _log.warning("Skipped rows with invalid topic from {url}".format(url=url))
        else:
            # The topic has already been verified as legit by now.
            topic = self.topic(source=source_topic, key="")
            if flatten:
                data = csv_data[0]
            else:
                data = csv_data
            self.vip.pubsub.publish('pubsub',
                                    topic=topic,
                                    message=data,
                                    headers=headers).get(timeout=10.0)

    def _handle_raw(self, headers, request, url, source_topic, source_params):
        """Publish the raw response body unmodified."""
        topic = self.topic(source=source_topic, key="")
        self.vip.pubsub.publish('pubsub',
                                topic=topic,
                                message=request.content,
                                headers=headers).get(timeout=10.0)
def main(argv=sys.argv):
    """Main method called by the eggsecutable.

    Hands the agent factory to the platform's VIP main loop; any
    unhandled error is logged rather than propagated so the process
    exits with a traceback in the log.
    """
    try:
        utils.vip_main(external_data_agent)
    except Exception as e:
        _log.exception('unhandled exception: {}'.format(e))
if __name__ == '__main__':
    # Entry point for script
    sys.exit(main())