pw_tokenizer: Tokenization options for proto

- Add a proto option for marking a field as tokenized.
- Create tools for automatically detokenizing protos with tokenized
  fields.
- Add missing inputs (ELF files for tests) to pw_tokenizer.
- Copy inputs to the out directory for generated packages.

Change-Id: If724cdb5e24ff3a86e89690806aa77fd4e7fdbe9
Reviewed-on: https://pigweed-review.googlesource.com/c/pigweed/pigweed/+/47741
Commit-Queue: Wyatt Hepler <hepler@google.com>
Pigweed-Auto-Submit: Wyatt Hepler <hepler@google.com>
Reviewed-by: Keir Mierle <keir@google.com>
Reviewed-by: Ewout van Bekkum <ewout@google.com>
diff --git a/pw_build/python.gni b/pw_build/python.gni
index 4d4b014..b7bf71a 100644
--- a/pw_build/python.gni
+++ b/pw_build/python.gni
@@ -323,6 +323,9 @@
       if (defined(invoker.tests)) {
         sources += invoker.tests
       }
+      if (defined(invoker.inputs)) {
+        sources += invoker.inputs
+      }
 
       source_root = _source_root
       public_deps = _python_deps + _other_deps
diff --git a/pw_tokenizer/BUILD.gn b/pw_tokenizer/BUILD.gn
index 8a0305f..520beb2 100644
--- a/pw_tokenizer/BUILD.gn
+++ b/pw_tokenizer/BUILD.gn
@@ -20,6 +20,7 @@
 import("$dir_pw_build/target_types.gni")
 import("$dir_pw_docgen/docs.gni")
 import("$dir_pw_fuzzer/fuzzer.gni")
+import("$dir_pw_protobuf_compiler/proto.gni")
 import("$dir_pw_unit_test/test.gni")
 import("backend.gni")
@@ -351,6 +352,12 @@
   ]
 }
 
+pw_proto_library("proto") {
+  sources = [ "options.proto" ]
+  prefix = "pw_tokenizer/proto"
+  python_package = "py"
+}
+
 declare_args() {
   # pw_JAVA_NATIVE_INTERFACE_INCLUDE_DIRS specifies the paths to use for
   # building Java Native Interface libraries. If no paths are provided, targets
@@ -380,6 +387,9 @@
 }
 
 pw_doc_group("docs") {
-  sources = [ "docs.rst" ]
+  sources = [
+    "docs.rst",
+    "proto.rst",
+  ]
   inputs = [ "py/pw_tokenizer/encode.py" ]
 }
diff --git a/pw_tokenizer/docs.rst b/pw_tokenizer/docs.rst
index 168bc4a..7d246de 100644
--- a/pw_tokenizer/docs.rst
+++ b/pw_tokenizer/docs.rst
@@ -834,6 +834,16 @@
     return Detokenizer(kDefaultDatabase);
   }
 
+Protocol buffers
+----------------
+``pw_tokenizer`` provides utilities for handling tokenized fields in protobufs.
+See :ref:`module-pw_tokenizer-proto` for details.
+
+.. toctree::
+  :hidden:
+
+  proto.rst
+
 Base64 format
 =============
 The tokenizer encodes messages to a compact binary representation. Applications
diff --git a/pw_tokenizer/options.proto b/pw_tokenizer/options.proto
new file mode 100644
index 0000000..bc0b87a
--- /dev/null
+++ b/pw_tokenizer/options.proto
@@ -0,0 +1,35 @@
+// Copyright 2021 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+syntax = "proto3";
+
+package pw.tokenizer;
+
+import "google/protobuf/descriptor.proto";
+
+enum Tokenization {
+  // The field may contain plain text or any type of tokenized data (binary or
+  // prefixed Base64).
+  TOKENIZATION_OPTIONAL = 0;
+}
+
+// Define the tokenized option, which indicates the data format for text in a
+// bytes field.
+extend google.protobuf.FieldOptions {
+  // The field number was randomly selected from the reserved, internal use
+  // field numbers (50000-99999).
+  // TODO(pwbug/393): Register with the Protobuf Global Extension Registry:
+  // https://github.com/protocolbuffers/protobuf/blob/HEAD/docs/options.md
+  Tokenization format = 78576;
+}
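The generated Python module for these options is imported as
``pw_tokenizer.proto.options_pb2`` (per the GN ``prefix`` above). As a minimal
sketch, the option can be checked on a field descriptor through protobuf's
standard extensions API, mirroring the helper added in
``pw_tokenizer/proto/__init__.py`` later in this change; the function name
``is_optionally_tokenized`` is hypothetical:

.. code-block:: python

  from google.protobuf.descriptor import FieldDescriptor

  from pw_tokenizer.proto import options_pb2


  def is_optionally_tokenized(field: FieldDescriptor) -> bool:
      # Custom options appear in the FieldOptions Extensions map, keyed by
      # the generated extension object (options_pb2.format here).
      extensions = field.GetOptions().Extensions
      return (options_pb2.format in extensions and extensions[
          options_pb2.format] == options_pb2.TOKENIZATION_OPTIONAL)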
diff --git a/pw_tokenizer/proto.rst b/pw_tokenizer/proto.rst
new file mode 100644
index 0000000..2479dd4
--- /dev/null
+++ b/pw_tokenizer/proto.rst
@@ -0,0 +1,138 @@
+.. _module-pw_tokenizer-proto:
+
+------------------------------------
+Tokenized fields in protocol buffers
+------------------------------------
+Text may be represented in a few different ways:
+
+- Plain ASCII or UTF-8 text (``This is plain text``)
+- Base64-encoded tokenized message (``$ibafcA==``)
+- Binary-encoded tokenized message (``89 b6 9f 70``)
+- Little-endian 32-bit integer token (``0x709fb689``)
+
+``pw_tokenizer`` provides tools for working with protobuf fields that may
+contain tokenized text.
+
+Tokenized field protobuf option
+===============================
+``pw_tokenizer`` provides the ``pw.tokenizer.format`` protobuf field option.
+Applying this option to a protobuf field indicates that it may contain a
+tokenized string. A string that is optionally tokenized is represented with a
+single ``bytes`` field annotated with
+``(pw.tokenizer.format) = TOKENIZATION_OPTIONAL``.
+
+For example, the following protobuf has one field that may contain a tokenized
+string.
+
+.. code-block:: protobuf
+
+  message MessageWithOptionallyTokenizedField {
+    bytes just_bytes = 1;
+    bytes maybe_tokenized = 2 [(pw.tokenizer.format) = TOKENIZATION_OPTIONAL];
+    string just_text = 3;
+  }
+
+Decoding optionally tokenized strings
+=====================================
+The encoding used for an optionally tokenized field is not recorded in the
+protobuf. Despite this, the text can be decoded reliably. This is accomplished
+by attempting to decode the field as binary or Base64 tokenized data before
+treating it as plain text.
+
+The following diagram describes the decoding process for optionally tokenized
+fields in detail.
+
+.. mermaid::
+
+  flowchart TD
+    start([Received bytes]) --> binary
+
+    binary[Decode as<br>binary tokenized] --> binary_ok
+    binary_ok{Detokenizes<br>successfully?} -->|no| utf8
+    binary_ok -->|yes| done_binary([Display decoded binary])
+
+    utf8[Decode as UTF-8] --> utf8_ok
+    utf8_ok{Valid UTF-8?} -->|no| base64_encode
+    utf8_ok -->|yes| base64
+
+    base64_encode[Encode as<br>tokenized Base64] --> display
+    display([Display encoded Base64])
+
+    base64[Decode as<br>Base64 tokenized] --> base64_ok
+
+    base64_ok{Fully<br>or partially<br>detokenized?} -->|no| is_plain_text
+    base64_ok -->|yes| base64_results
+
+    is_plain_text{Text is<br>printable?} -->|no| base64_encode
+    is_plain_text -->|yes| plain_text
+
+    base64_results([Display decoded Base64])
+    plain_text([Display text])
+
+Potential decoding problems
+---------------------------
+The decoding process for optionally tokenized fields yields correct results in
+almost every situation. In rare circumstances, it can fail, but the failures
+can be avoided with a low-overhead mitigation if desired.
+
+There are two ways in which the decoding process may fail.
+
+Accidentally interpreting plain text as tokenized binary
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If a plain-text string happens to decode as a binary tokenized message, the
+incorrect message could be displayed. This is very unlikely to occur. While
+many tokens will incidentally end up being valid UTF-8 strings, the
+overwhelming majority of these strings will be nonsense, and it is highly
+unlikely that a device will happen to log one of them as plain text.
+
+If an implementation wishes to guard against this extremely improbable
+situation, it can be prevented with a small encoding change.
+This is done by appending 0xFF (or another byte that is never valid in UTF-8)
+to binary tokenized data that happens to be valid UTF-8 (or to all binary
+tokenized messages, if desired). When decoding, an extra trailing 0xFF byte is
+discarded.
+
+Displaying undecoded binary as plain text instead of Base64
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If a message fails to decode as binary tokenized and it is not valid UTF-8, it
+is displayed as tokenized Base64. This makes it easily recognizable as a
+tokenized message and makes it simple to decode later from the text output (for
+example, with an updated token database).
+
+A binary message for which the token is not known may coincidentally be valid
+UTF-8 or ASCII. 6.25% of 4-byte sequences are composed only of ASCII
+characters. When decoding with an out-of-date token database, it is possible
+that some binary tokenized messages will be displayed as plain text rather
+than tokenized Base64.
+
+This situation is likely to occur in practice, but should be infrequent. Even
+if it does happen, it is not a serious issue. A very small number of strings
+will be displayed incorrectly, but these strings cannot be decoded anyway. One
+nonsense string (e.g. ``a-D1``) would be displayed instead of another
+(``$YS1EMQ==``). Updating the token database would resolve the issue, though
+the non-Base64 logs would be difficult to decode later from a log file.
+
+This situation can be avoided with the same approach described in
+`Accidentally interpreting plain text as tokenized binary`_: appending a byte
+that is invalid in UTF-8 prevents the undecoded binary message from being
+interpreted as plain text.
+
+Python library
+==============
+The ``pw_tokenizer.proto`` module defines functions that may be used to
+detokenize protobuf objects in Python. The function
+:func:`pw_tokenizer.proto.detokenize_fields` detokenizes all fields annotated
+as tokenized, replacing them with their detokenized version. For example:
+
+.. code-block:: python
+
+  my_detokenizer = pw_tokenizer.Detokenizer(some_database)
+
+  my_message = SomeMessage(tokenized_field=b'$YS1EMQ==')
+  pw_tokenizer.proto.detokenize_fields(my_detokenizer, my_message)
+
+  assert my_message.tokenized_field == b'The detokenized string! Cool!'
+
+pw_tokenizer.proto
+------------------
+.. automodule:: pw_tokenizer.proto
+  :members:
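The 0xFF guard-byte mitigation described in ``proto.rst`` is optional and is
not implemented in this change. A minimal sketch of what the encoder- and
decoder-side helpers might look like (the names ``guard_tokenized`` and
``unguard`` are hypothetical, not part of ``pw_tokenizer``):

.. code-block:: python

  def guard_tokenized(tokenized: bytes) -> bytes:
      """Appends 0xFF to binary tokenized data that is valid UTF-8.

      0xFF never appears in valid UTF-8, so guarded tokenized data cannot be
      mistaken for plain text by the decoder.
      """
      try:
          tokenized.decode()
      except UnicodeDecodeError:
          return tokenized  # Already invalid UTF-8; no guard byte needed.
      return tokenized + b'\xff'


  def unguard(data: bytes) -> bytes:
      """Drops a trailing 0xFF guard byte, if present, before detokenizing."""
      return data[:-1] if data.endswith(b'\xff') else data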
diff --git a/pw_tokenizer/py/BUILD.gn b/pw_tokenizer/py/BUILD.gn
index 00128d1..b2d19e9 100644
--- a/pw_tokenizer/py/BUILD.gn
+++ b/pw_tokenizer/py/BUILD.gn
@@ -15,9 +15,14 @@
 import("//build_overrides/pigweed.gni")
 
 import("$dir_pw_build/python.gni")
+import("$dir_pw_protobuf_compiler/proto.gni")
 
 pw_python_package("py") {
-  setup = [ "setup.py" ]
+  generate_setup = {
+    name = "pw_tokenizer"
+    version = "0.0.1"
+    extra_requires = [ "serial" ]
+  }
   sources = [
     "generate_argument_types_macro.py",
     "generate_hash_macro.py",
@@ -29,12 +34,14 @@
     "pw_tokenizer/detokenize.py",
     "pw_tokenizer/elf_reader.py",
     "pw_tokenizer/encode.py",
+    "pw_tokenizer/proto/__init__.py",
     "pw_tokenizer/serial_detokenizer.py",
     "pw_tokenizer/tokens.py",
   ]
   tests = [
     "database_test.py",
     "decode_test.py",
+    "detokenize_proto_test.py",
     "detokenize_test.py",
     "elf_reader_test.py",
     "encode_test.py",
@@ -42,10 +49,18 @@
     "tokens_test.py",
     "varint_test_data.py",
   ]
+  python_test_deps = [ ":test_proto.python" ]
   inputs = [
     "elf_reader_test_binary.elf",
     "example_binary_with_tokenized_strings.elf",
     "example_legacy_binary_with_tokenized_strings.elf",
   ]
+  proto_library = "..:proto"
   pylintrc = "$dir_pigweed/.pylintrc"
 }
+
+pw_proto_library("test_proto") {
+  sources = [ "detokenize_proto_test.proto" ]
+  deps = [ "..:proto" ]
+  prefix = "pw_tokenizer_tests"
+}
diff --git a/pw_tokenizer/py/detokenize_proto_test.proto b/pw_tokenizer/py/detokenize_proto_test.proto
new file mode 100644
index 0000000..ff9439b
--- /dev/null
+++ b/pw_tokenizer/py/detokenize_proto_test.proto
@@ -0,0 +1,24 @@
+// Copyright 2021 The Pigweed Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy of
+// the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+
+syntax = "proto3";
+
+package this_pigweed_test;
+
+import "pw_tokenizer/proto/options.proto";
+
+message TheMessage {
+  bytes just_bytes = 1;
+  bytes message = 2 [(pw.tokenizer.format) = TOKENIZATION_OPTIONAL];
+}
diff --git a/pw_tokenizer/py/detokenize_proto_test.py b/pw_tokenizer/py/detokenize_proto_test.py
new file mode 100644
index 0000000..d1bc269
--- /dev/null
+++ b/pw_tokenizer/py/detokenize_proto_test.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# Copyright 2021 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Tests decoding a proto with tokenized fields."""
+
+import unittest
+
+from pw_tokenizer_tests.detokenize_proto_test_pb2 import TheMessage
+
+from pw_tokenizer import detokenize, encode, tokens
+from pw_tokenizer.proto import detokenize_fields
+
+_DATABASE = tokens.Database(
+    [tokens.TokenizedStringEntry(0xAABBCCDD, "Luke, we're gonna have %s")])
+_DETOKENIZER = detokenize.Detokenizer(_DATABASE)
+
+
+class TestDetokenizeProtoFields(unittest.TestCase):
+    """Tests detokenizing optionally tokenized proto fields."""
+    def test_plain_text(self) -> None:
+        proto = TheMessage(message=b'boring conversation anyway!')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message, b'boring conversation anyway!')
+
+    def test_binary(self) -> None:
+        proto = TheMessage(message=b'\xDD\xCC\xBB\xAA\x07company')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message, b"Luke, we're gonna have company")
+
+    def test_base64(self) -> None:
+        base64 = encode.prefixed_base64(b'\xDD\xCC\xBB\xAA\x07company')
+        proto = TheMessage(message=base64.encode())
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message, b"Luke, we're gonna have company")
+
+    def test_plain_text_with_prefixed_base64(self) -> None:
+        base64 = encode.prefixed_base64(b'\xDD\xCC\xBB\xAA\x09pancakes!')
+        proto = TheMessage(message=f'Good morning, {base64}'.encode())
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message,
+                         b"Good morning, Luke, we're gonna have pancakes!")
+
+    def test_unknown_token_not_utf8(self) -> None:
+        proto = TheMessage(message=b'\xFE\xED\xF0\x0D')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message.decode(),
+                         encode.prefixed_base64(b'\xFE\xED\xF0\x0D'))
+
+    def test_only_control_characters(self) -> None:
+        proto = TheMessage(message=b'\1\2\3\4')
+        detokenize_fields(_DETOKENIZER, proto)
+        self.assertEqual(proto.message.decode(),
+                         encode.prefixed_base64(b'\1\2\3\4'))
+
+
+if __name__ == '__main__':
+    unittest.main()
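The binary payloads in these tests encode the token 0xAABBCCDD in
little-endian byte order, followed by the ``%s`` argument. A sketch of
constructing such a payload by hand, assuming the string argument is encoded
with a single length byte (as the test data ``b'\x07company'`` implies); the
helper ``tokenized_with_string_arg`` is hypothetical:

.. code-block:: python

  import struct

  TOKEN = 0xAABBCCDD  # Token for "Luke, we're gonna have %s"


  def tokenized_with_string_arg(arg: str) -> bytes:
      # 4-byte little-endian token, then the %s argument as a
      # length-prefixed UTF-8 string.
      encoded_arg = arg.encode()
      return struct.pack('<I', TOKEN) + bytes([len(encoded_arg)]) + encoded_arg


  assert tokenized_with_string_arg('company') == b'\xDD\xCC\xBB\xAA\x07company'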
diff --git a/pw_tokenizer/py/pw_tokenizer/proto/__init__.py b/pw_tokenizer/py/pw_tokenizer/proto/__init__.py
new file mode 100644
index 0000000..ac2e95d
--- /dev/null
+++ b/pw_tokenizer/py/pw_tokenizer/proto/__init__.py
@@ -0,0 +1,73 @@
+# Copyright 2021 The Pigweed Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+"""Utilities for working with tokenized fields in protobufs."""
+
+from typing import Iterator
+
+from google.protobuf.descriptor import FieldDescriptor
+from google.protobuf.message import Message
+
+from pw_tokenizer.proto import options_pb2
+from pw_tokenizer import detokenize, encode
+
+
+def _tokenized_fields(proto: Message) -> Iterator[FieldDescriptor]:
+    for field in proto.DESCRIPTOR.fields:
+        extensions = field.GetOptions().Extensions
+        if options_pb2.format in extensions and extensions[
+                options_pb2.format] == options_pb2.TOKENIZATION_OPTIONAL:
+            yield field
+
+
+def decode_optionally_tokenized(detokenizer: detokenize.Detokenizer,
+                                data: bytes) -> str:
+    """Decodes data that may be plain text or binary / Base64 tokenized text."""
+    # Try detokenizing as binary.
+    result = detokenizer.detokenize(data)
+    if result.ok():
+        return str(result)
+
+    # Attempt to decode as UTF-8.
+    try:
+        text = data.decode()
+    except UnicodeDecodeError:
+        # Not UTF-8. Assume the token is unknown or the data is corrupt.
+        return encode.prefixed_base64(data)
+
+    # See if the string is prefixed Base64 or contains prefixed Base64.
+    detokenized = detokenize.detokenize_base64(detokenizer, data)
+    if detokenized != data:  # If anything detokenized successfully, use that.
+        return detokenized.decode()
+
+    # Attempt to determine whether this is an unknown token or plain text.
+    # Any string with only printable or whitespace characters is plain text.
+    if ''.join(text.split()).isprintable():
+        return text
+
+    # Assume this field is tokenized data that could not be decoded.
+    return encode.prefixed_base64(data)
+
+
+def detokenize_fields(detokenizer: detokenize.Detokenizer,
+                      proto: Message) -> None:
+    """Detokenizes fields annotated as tokenized in the given proto.
+
+    The fields are replaced with their detokenized version in the proto.
+    Tokenized fields are bytes fields, so the detokenized string is stored as
+    bytes. Call .decode() to convert the detokenized string from bytes to str.
+    """
+    for field in _tokenized_fields(proto):
+        decoded = decode_optionally_tokenized(detokenizer,
+                                              getattr(proto, field.name))
+        setattr(proto, field.name, decoded.encode())
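For a standalone usage example, ``decode_optionally_tokenized`` can also be
called directly on raw bytes, independent of any proto. A short sketch reusing
the token database from the test above:

.. code-block:: python

  from pw_tokenizer import detokenize, tokens
  from pw_tokenizer.proto import decode_optionally_tokenized

  detokenizer = detokenize.Detokenizer(tokens.Database(
      [tokens.TokenizedStringEntry(0xAABBCCDD, "Luke, we're gonna have %s")]))

  # Binary tokenized data: 4-byte little-endian token, then the %s argument.
  assert (decode_optionally_tokenized(detokenizer,
                                      b'\xDD\xCC\xBB\xAA\x07company')
          == "Luke, we're gonna have company")

  # Printable text that does not detokenize passes through unchanged.
  assert decode_optionally_tokenized(detokenizer, b'plain text') == 'plain text'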