changeset 4255:38da10e4b593

mod_ogp: Update parsing logic to catch more cases
author Seve Ferrer <seve@delape.net>
date Wed, 18 Nov 2020 13:48:07 +0100
parents a4e182d7ff0a
children c4b9d4ba839b
files mod_ogp/mod_ogp.lua mod_ogp/test.lua
diffstat 2 files changed, 70 insertions(+), 49 deletions(-) [+]
line wrap: on
line diff
--- a/mod_ogp/mod_ogp.lua	Wed Nov 18 11:16:11 2020 +0100
+++ b/mod_ogp/mod_ogp.lua	Wed Nov 18 13:48:07 2020 +0100
@@ -30,32 +30,40 @@
 			local fastening = st.message({to = to, from = from}):tag("apply-to", {xmlns = "urn:xmpp:fasten:0", id = origin_id})
 			local found_metadata = false
 			local message_body = ""
-			for property, content in response_body:gmatch(ogp_pattern) do
-				module:log("info", property .. "\t" .. content)
-				fastening:tag(
-					"meta",
-					{
-						xmlns = "http://www.w3.org/1999/xhtml",
-						property = property,
-						content = content
-					}
-				):up()
-				found_metadata = true
-				message_body = message_body .. property .. "\t" .. content .. "\n"
+
+			-- Collect the attribute text of every <meta ...> tag, then look up
+			-- property= and content= separately so their order does not matter.
+			local meta_pattern = [[<meta (.-)/?>]]
+			for match in response_body:gmatch(meta_pattern) do
+				-- og: property before another attribute, else as the last attribute.
+				local property = match:match([[property=%s*["']?(og:.-)["']?%s]])
+					or match:match([[property=%s*["']?(og:.-)["']?$]])
+
+				-- Quoted content: %1 requires the closing quote to equal the opening
+				-- one, so an apostrophe inside "..." does not cut the value short.
+				local _, content = match:match([[content=%s*(["'])(.-)%1]])
+				if not content then
+					-- Unquoted content runs up to the property attribute or the end.
+					content = match:match([[content=(.-) property]])
+						or match:match([[content=(.-)$]])
+				end
+
+				if property and content then
+					-- Fetched text is passed as arguments, never as the format string.
+					module:log("info", "%s\t%s", property, content)
+					fastening:tag(
+						"meta",
+						{
+							xmlns = "http://www.w3.org/1999/xhtml",
+							property = property,
+							content = content
+						}
+					):up()
+					found_metadata = true
+					message_body = message_body .. property .. "\t" .. content .. "\n"
+				end
+			end
-			for content, property in response_body:gmatch(ogp_pattern2) do
-				module:log("info", property .. "\t" .. content)
-				fastening:tag(
-					"meta",
-					{
-						xmlns = "http://www.w3.org/1999/xhtml",
-						property = property,
-						content = content
-					}
-				):up()
-				found_metadata = true
-				message_body = message_body .. property .. "\t" .. content .. "\n"
-			end
+
 
 			if found_metadata then
 				mod_muc.get_room_from_jid(room.jid):broadcast_message(fastening)
--- a/mod_ogp/test.lua	Wed Nov 18 11:16:11 2020 +0100
+++ b/mod_ogp/test.lua	Wed Nov 18 13:48:07 2020 +0100
@@ -1,30 +1,43 @@
 local html = [[
-<meta property="og:title" content="Example 1">
-<meta property=og:title content="Example 2">
-<meta property="og:title" content="Example 3" >
-<meta property="og:title" content="Example 4" />
-<meta property="og:title" content="Example 5"/>
-<meta property=og:title content=Example 6/>
-<meta property="og:title" content= "Example 7" />
-<meta property="og:title" itemprop="image primaryImageOfPage" content="Example 8" />
-<meta content="Example 9" property="og:title" >
-<meta content="Example 10" property="og:title">
-<meta content="Example 11" property="og:title"/>
-<meta content="Example 12" property="og:title" />
-<meta content="Example 13" property=og:title >
-<meta content=Example 14 property=og:title >
-<meta content= "Example 15" property="og:title" />
-<meta content="Example 16" itemprop="image primaryImageOfPage"  property="og:title" />
+<meta property="og:title" content="Example 1 A">
+<meta property=og:title content="Example 2 B">
+<meta property="og:title" content="Example 3 C" >
+<meta property="og:title" content="Example 4 D" />
+<meta property="og:title" content="Example 5 E"/>
+<meta property=og:title content=Example 6 F/>
+<meta property="og:title" content= "Example 7 G" />
+<meta property="og:title" itemprop="image primaryImageOfPage" content="Example 8 H" />
+<meta property='og:title' content='Example 9 I' />
+<meta content="Example 10 J" property="og:title" >
+<meta content="Example 11 K" property="og:title">
+<meta content="Example 12 L" property="og:title"/>
+<meta content="Example 13 M" property="og:title" />
+<meta content="Example 14 N" property=og:title >
+<meta content=Example 15 O property=og:title >
+<meta content= "Example 16 P" property="og:title" />
+<meta content="Example 17 Q" itemprop="image primaryImageOfPage"  property="og:title" />
+<meta content= 'Example 18 R' property='og:title' />
 ]]
 
 
-local ogp_pattern = [[<meta property=["']?(og:.-)["']? content=%s*["']?(.-)["']?%s-/?>]]
-local ogp_pattern2 = [[<meta content=%s*["']?(.-)["']? property=["']?(og:.-)["']?%s-/?>]]
+
+-- Take the attribute text of each <meta ...> tag, then look up property=
+-- and content= separately so that their order does not matter.
+local meta_pattern = [[<meta (.-)/?>]]
+for match in html:gmatch(meta_pattern) do
+    -- og: property before another attribute, else as the last attribute.
+    local property = match:match([[property=%s*["']?(og:.-)["']?%s]])
+        or match:match([[property=%s*["']?(og:.-)["']?$]])
 
-for property, content in html:gmatch(ogp_pattern) do
-    print("Pattern 1|", property, content, "|Pattern 1")
+    -- Quoted content: %1 requires the closing quote to equal the opening
+    -- one, so an apostrophe inside "..." does not cut the value short.
+    local _, content = match:match([[content=%s*(["'])(.-)%1]])
+    if not content then
+        -- Unquoted content runs up to the property attribute or the end.
+        content = match:match([[content=(.-) property]])
+            or match:match([[content=(.-)$]])
+    end
+
+    -- One row per tag: property, content, raw attribute text ("|" marks its end).
+    print(property, '\t', content, '\t', match .. "|")
+end
-print('-------------------------------------------------------------')
-for content, property in html:gmatch(ogp_pattern2) do
-    print("Pattern 2|", property, content, "|Pattern 2")
-end