| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/dom_distiller/core/page_features.h" | 5 #include "components/dom_distiller/core/page_features.h" |
| 6 | 6 |
| 7 #include <string> | 7 #include <string> |
| 8 #include <vector> | 8 #include <vector> |
| 9 | 9 |
| 10 #include "base/files/file_util.h" | 10 #include "base/files/file_util.h" |
| 11 #include "base/json/json_reader.h" | 11 #include "base/json/json_reader.h" |
| 12 #include "base/json/json_writer.h" | 12 #include "base/json/json_writer.h" |
| 13 #include "base/memory/scoped_ptr.h" | 13 #include "base/memory/scoped_ptr.h" |
| 14 #include "base/path_service.h" | 14 #include "base/path_service.h" |
| 15 #include "testing/gtest/include/gtest/gtest.h" | 15 #include "testing/gtest/include/gtest/gtest.h" |
| 16 #include "third_party/WebKit/public/platform/WebDistillability.h" |
| 17 #include "url/gurl.h" |
| 16 | 18 |
| 17 namespace dom_distiller { | 19 namespace dom_distiller { |
| 18 | 20 |
| 19 // This test uses input data of core features and the output of the training | 21 // This test uses input data of core features and the output of the training |
| 20 // pipeline's derived feature extraction to ensure that the extraction that is | 22 // pipeline's derived feature extraction to ensure that the extraction that is |
| 21 // done in Chromium matches that in the training pipeline. | 23 // done in Chromium matches that in the training pipeline. |
// [OLD column] TestCalculateDerivedFeatures: reads core_features.json and
// derived_features.json from components/test/data/dom_distiller under
// DIR_SOURCE_ROOT, parses both as JSON lists (asserted to have equal size),
// and for every input entry compares the result of
// CalculateDerivedFeaturesFromJSON with the expected values via
// EXPECT_DOUBLE_EQ (boolean expectations are mapped to 1.0 / 0.0).
// Rows that end in an empty cell carry OLD text only; the remaining rows also
// carry the NEW-column tests TestPath and TestPath2.
| 22 TEST(DomDistillerPageFeaturesTest, TestCalculateDerivedFeatures) { | |
| 23 base::FilePath dir_source_root; | |
| 24 EXPECT_TRUE(PathService::Get(base::DIR_SOURCE_ROOT, &dir_source_root)); | |
| 25 std::string input_data; | |
| 26 ASSERT_TRUE(base::ReadFileToString( | |
| 27 dir_source_root.AppendASCII( | |
| 28 "components/test/data/dom_distiller/core_features.json"), | |
| 29 &input_data)); | |
| 30 std::string expected_output_data; | |
| 31 // This file contains the output from the calculation of derived features in | |
| 32 // the training pipeline. | |
| 33 ASSERT_TRUE(base::ReadFileToString( | |
| 34 dir_source_root.AppendASCII( | |
| 35 "components/test/data/dom_distiller/derived_features.json"), | |
| 36 &expected_output_data)); | |
| 37 | 24 |
// [NEW column] TestPath: calls CalculateDerivedFeatures with a
// value-initialized blink::WebDistillabilityFeatures and a URL whose path is
// /search/view/index/the-title-of-archive.php, then checks the result size and
// the rounded values at indices 1-14 (index 0 is never checked).
// The expectations are consistent with derived[9] being the path length (43)
// and derived[14] the length of the last path segment (24) - TODO confirm
// against CalculateDerivedFeatures.
// NOTE(review): EXPECT_EQ on the size is non-fatal, so the derived[...] reads
// still execute if the size is wrong; ASSERT_EQ would stop the test first.
// NOTE(review): the header comment at new lines 21-23 describes a
// training-pipeline data comparison, yet in NEW it sits directly above this
// test, which reads no data files - looks stale, confirm.
// NOTE(review): lround is used but <cmath> is not among the includes visible
// at the top of this file - presumably pulled in transitively; verify.
| 38 scoped_ptr<base::Value> input_json = base::JSONReader::Read(input_data); | 25 TEST(DomDistillerPageFeaturesTest, TestPath) { |
| 39 ASSERT_TRUE(input_json); | 26 blink::WebDistillabilityFeatures f = blink::WebDistillabilityFeatures(); |
| 40 | 27 |
| 41 scoped_ptr<base::Value> expected_output_json = | 28 GURL url("http://example.com/search/view/index/the-title-of-archive.php"); |
| 42 base::JSONReader::Read(expected_output_data); | |
| 43 ASSERT_TRUE(expected_output_json); | |
| 44 | 29 |
| 45 base::ListValue* input_entries; | 30 std::vector<double> derived(CalculateDerivedFeatures(f, url)); |
| 46 ASSERT_TRUE(input_json->GetAsList(&input_entries)); | 31 EXPECT_EQ(kDerivedFeaturesCount, derived.size()); |
| 47 ASSERT_GT(input_entries->GetSize(), 0u); | |
| 48 | 32 |
| 49 base::ListValue* expected_output_entries; | 33 EXPECT_EQ(0, lround(derived[1])); |
| 50 ASSERT_TRUE(expected_output_json->GetAsList(&expected_output_entries)); | 34 EXPECT_EQ(1, lround(derived[2])); |
| 51 ASSERT_EQ(expected_output_entries->GetSize(), input_entries->GetSize()); | 35 EXPECT_EQ(1, lround(derived[3])); |
| 36 EXPECT_EQ(1, lround(derived[4])); |
| 37 EXPECT_EQ(1, lround(derived[5])); |
| 38 EXPECT_EQ(0, lround(derived[6])); |
| 39 EXPECT_EQ(0, lround(derived[7])); |
| 40 EXPECT_EQ(1, lround(derived[8])); |
| 41 EXPECT_EQ(43, lround(derived[9])); |
| 42 EXPECT_EQ(0, lround(derived[10])); |
| 43 EXPECT_EQ(4, lround(derived[11])); |
| 44 EXPECT_EQ(4, lround(derived[12])); |
| 45 EXPECT_EQ(0, lround(derived[13])); |
| 46 EXPECT_EQ(24, lround(derived[14])); |
| 47 } |
| 52 | 48 |
// [NEW column] TestPath2: same shape as TestPath, with the URL path
// /phpbb/forum123/456.asp. The expectations are consistent with derived[9]
// being the path length (23) and derived[14] the last-segment length (7) -
// TODO confirm against CalculateDerivedFeatures.
// NOTE(review): the size check is a non-fatal EXPECT_EQ, so the indexed reads
// that follow are not guarded by it; index 0 is never checked.
| 53 // In the output, the features list is a sequence of labels followed by values | 49 TEST(DomDistillerPageFeaturesTest, TestPath2) { |
| 54 // (so labels at even indices, values at odd indices). | 50 blink::WebDistillabilityFeatures f = blink::WebDistillabilityFeatures(); |
| 55 base::DictionaryValue* entry; | |
| 56 base::ListValue* derived_features; | |
| 57 ASSERT_TRUE(expected_output_entries->GetDictionary(0, &entry)); | |
| 58 ASSERT_TRUE(entry->GetList("features", &derived_features)); | |
| 59 std::vector<std::string> labels; | |
| 60 for (size_t i = 0; i < derived_features->GetSize(); i += 2) { | |
| 61 std::string label; | |
| 62 ASSERT_TRUE(derived_features->GetString(i, &label)); | |
| 63 labels.push_back(label); | |
| 64 } | |
| 65 | 51 |
| 66 for (size_t i = 0; i < input_entries->GetSize(); ++i) { | 52 GURL url("http://example.com/phpbb/forum123/456.asp"); |
| 67 base::DictionaryValue* core_features; | |
| 68 ASSERT_TRUE(input_entries->GetDictionary(i, &entry)); | |
| 69 ASSERT_TRUE(entry->GetDictionary("features", &core_features)); | |
| 70 // CalculateDerivedFeaturesFromJSON expects a base::Value of the stringified | |
| 71 // JSON (and not a base::Value of the JSON itself) | |
| 72 std::string stringified_json; | |
| 73 ASSERT_TRUE(base::JSONWriter::Write(*core_features, &stringified_json)); | |
| 74 scoped_ptr<base::Value> stringified_value( | |
| 75 new base::StringValue(stringified_json)); | |
| 76 std::vector<double> derived( | |
| 77 CalculateDerivedFeaturesFromJSON(stringified_value.get())); | |
| 78 | 53 |
| 79 ASSERT_EQ(labels.size(), derived.size()); | 54 std::vector<double> derived(CalculateDerivedFeatures(f, url)); |
| 80 ASSERT_TRUE(expected_output_entries->GetDictionary(i, &entry)); | 55 EXPECT_EQ(kDerivedFeaturesCount, derived.size()); |
| 81 ASSERT_TRUE(entry->GetList("features", &derived_features)); | 56 EXPECT_EQ(1, lround(derived[1])); |
| 82 std::string entry_url; | 57 EXPECT_EQ(0, lround(derived[2])); |
| 83 ASSERT_TRUE(entry->GetString("url", &entry_url)); | 58 EXPECT_EQ(0, lround(derived[3])); |
| 84 for (size_t j = 0, value_index = 1; j < derived.size(); | 59 EXPECT_EQ(0, lround(derived[4])); |
| 85 ++j, value_index += 2) { | 60 EXPECT_EQ(0, lround(derived[5])); |
| 86 double expected_value; | 61 EXPECT_EQ(1, lround(derived[6])); |
| 87 if (!derived_features->GetDouble(value_index, &expected_value)) { | 62 EXPECT_EQ(1, lround(derived[7])); |
| 88 bool bool_value; | 63 EXPECT_EQ(0, lround(derived[8])); |
| 89 ASSERT_TRUE(derived_features->GetBoolean(value_index, &bool_value)); | 64 EXPECT_EQ(23, lround(derived[9])); |
| 90 expected_value = bool_value ? 1.0 : 0.0; | 65 EXPECT_EQ(0, lround(derived[10])); |
| 91 } | 66 EXPECT_EQ(3, lround(derived[11])); |
| 92 EXPECT_DOUBLE_EQ(derived[j], expected_value) | 67 EXPECT_EQ(1, lround(derived[12])); |
| 93 << "incorrect value for entry with url " << entry_url | 68 EXPECT_EQ(2, lround(derived[13])); |
| 94 << " for derived feature " << labels[j]; | 69 EXPECT_EQ(7, lround(derived[14])); |
| 95 } | |
| 96 } | |
| 97 } | 70 } |
| 71 |
// [NEW column] TestPath3: calls CalculateDerivedFeatures with a
// value-initialized blink::WebDistillabilityFeatures and the URL
// https://example.com/, then checks the result size and the rounded values at
// indices 1-14 (index 0 is never checked). Only indices 9 and 10 are expected
// to be 1; every other checked index is expected to be 0.
// The value 1 at derived[9] is consistent with it being the length of the
// path "/" - TODO confirm against CalculateDerivedFeatures.
// NOTE(review): EXPECT_EQ on the size is non-fatal, so the derived[...] reads
// still execute if the size is wrong; ASSERT_EQ would stop the test first.
// The last row of this block also carries the OLD file's final closing brace
// (old line 98) next to this test's closing brace (new line 93).
| 72 TEST(DomDistillerPageFeaturesTest, TestPath3) { |
| 73 blink::WebDistillabilityFeatures f = blink::WebDistillabilityFeatures(); |
| 74 |
| 75 GURL url("https://example.com/"); |
| 76 |
| 77 std::vector<double> derived(CalculateDerivedFeatures(f, url)); |
| 78 EXPECT_EQ(kDerivedFeaturesCount, derived.size()); |
| 79 EXPECT_EQ(0, lround(derived[1])); |
| 80 EXPECT_EQ(0, lround(derived[2])); |
| 81 EXPECT_EQ(0, lround(derived[3])); |
| 82 EXPECT_EQ(0, lround(derived[4])); |
| 83 EXPECT_EQ(0, lround(derived[5])); |
| 84 EXPECT_EQ(0, lround(derived[6])); |
| 85 EXPECT_EQ(0, lround(derived[7])); |
| 86 EXPECT_EQ(0, lround(derived[8])); |
| 87 EXPECT_EQ(1, lround(derived[9])); |
| 88 EXPECT_EQ(1, lround(derived[10])); |
| 89 EXPECT_EQ(0, lround(derived[11])); |
| 90 EXPECT_EQ(0, lround(derived[12])); |
| 91 EXPECT_EQ(0, lround(derived[13])); |
| 92 EXPECT_EQ(0, lround(derived[14])); |
| 98 } | 93 } |
| 94 |
| 95 } |
| OLD | NEW |