I am trying to read a number of JSONL (JSON Lines) files, not regular JSON, with sparklyr, using the following code:
sample <- spark_read_json(sc, "sample.jsonl")
Because the files are very large, here is a two-line sample from sample.jsonl:
{"created_at": "Thu Jun 18 21:00:08 +0000 2020", "id": 1273722186369585155, "id_str": "1273722186369585155", "text": "RT @spunbeam: Breonna Taylor\u2019s name is no longer trending, and the police that murdered her are still free. If you see this please reply/qu\u2026", "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>", "truncated": false, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 789469454660956160, "id_str": "789469454660956160", "name": "Princess Daddy \ud83d\udc78\ud83c\udffe\ud83d\udc95", "screen_name": "jadaciti", "location": "Lafayette, LA", "url": "https://youtu.be/TaEw0P8qSeo", "description": "a goddess \ud83e\uddde\u200d\u2640\ufe0f & a cancer \u264b\ufe0f #goddessgang \u2728 #BlackLivesMatter \u270a\ud83c\udffc\u270a\ud83c\udffd\u270a\ud83c\udffe\u270a\ud83c\udfff\ud83d\udda4", "translator_type": "none", "protected": false, "verified": false, "followers_count": 229, "friends_count": 152, "listed_count": 0, "favourites_count": 7665, "statuses_count": 7702, "created_at": "Fri Oct 21 14:12:40 +0000 2016", "utc_offset": null, "time_zone": null, "geo_enabled": false, "lang": null, "contributors_enabled": false, "is_translator": false, "profile_background_color": "F5F8FA", "profile_background_image_url": "", "profile_background_image_url_https": "", "profile_background_tile": false, "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "profile_image_url": "http://pbs.twimg.com/profile_images/1270931500297605121/tK6ICOLj_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1270931500297605121/tK6ICOLj_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/789469454660956160/1591421951", "default_profile": 
true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null}, "geo": null, "coordinates": null, "place": null, "contributors": null, "retweeted_status": {"created_at": "Wed Jun 17 15:21:31 +0000 2020", "id": 1273274580833050624, "id_str": "1273274580833050624", "text": "Breonna Taylor\u2019s name is no longer trending, and the police that murdered her are still free. If you see this pleas\u2026 https://t.co/f88oqSdhla", "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>", "truncated": true, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1199772686865616909, "id_str": "1199772686865616909", "name": "BLM Green Man \ud83d\udd1c TipDaddy", "screen_name": "spunbeam", "location": "BPD TOWN USA", "url": null, "description": "yer local greenbean\ud83e\udd74wobbling heavily since 2016\ud83c\udf44(*\u02d8\ufe36\u02d8*).\uff61*\ud83d\udc9a\ud83d\udd1c TipDaddy n Friends. 
\n21 - \u2653\u2600\ufe0f\u264a\ud83c\udf17\u264a\u2b06\ufe0f -Toxic for men, Wholesome for women \ud83c\udff3\ufe0f\u200d\ud83c\udf08\ud83c\udde8\ud83c\uddfa\ud83c\udf44\ud83d\ude3b\ud83c\udf32\ud83c\udfb6&LSD", "translator_type": "none", "protected": false, "verified": false, "followers_count": 651, "friends_count": 1224, "listed_count": 0, "favourites_count": 23214, "statuses_count": 4142, "created_at": "Wed Nov 27 19:31:20 +0000 2019", "utc_offset": null, "time_zone": null, "geo_enabled": false, "lang": null, "contributors_enabled": false, "is_translator": false, "profile_background_color": "F5F8FA", "profile_background_image_url": "", "profile_background_image_url_https": "", "profile_background_tile": false, "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "profile_image_url": "http://pbs.twimg.com/profile_images/1248737938885746695/5lgHu_18_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1248737938885746695/5lgHu_18_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/1199772686865616909/1583024393", "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null}, "geo": null, "coordinates": null, "place": null, "contributors": null, "is_quote_status": false, "extended_tweet": {"full_text": "Breonna Taylor\u2019s name is no longer trending, and the police that murdered her are still free. If you see this please reply/quote/tweet on ur own. 
ARREST THE COPS WHO KILLED #BREONNATAYLOR", "display_text_range": [0, 187], "entities": {"hashtags": [{"text": "BREONNATAYLOR", "indices": [173, 187]}], "urls": [], "user_mentions": [], "symbols": []}}, "quote_count": 780, "reply_count": 213, "retweet_count": 15755, "favorite_count": 12153, "entities": {"hashtags": [], "urls": [{"url": "https://t.co/f88oqSdhla", "expanded_url": "https://twitter.com/i/web/status/1273274580833050624", "display_url": "twitter.com/i/web/status/1\u2026", "indices": [117, 140]}], "user_mentions": [], "symbols": []}, "favorited": false, "retweeted": false, "filter_level": "low", "lang": "en"}, "is_quote_status": false, "quote_count": 0, "reply_count": 0, "retweet_count": 0, "favorite_count": 0, "entities": {"hashtags": [], "urls": [], "user_mentions": [{"screen_name": "spunbeam", "name": "BLM Green Man \ud83d\udd1c TipDaddy", "id": 1199772686865616909, "id_str": "1199772686865616909", "indices": [3, 12]}], "symbols": []}, "favorited": false, "retweeted": false, "filter_level": "low", "lang": "en", "timestamp_ms": "1592514008501"}
{"created_at": "Thu Jun 18 21:00:08 +0000 2020", "id": 1273722186436665346, "id_str": "1273722186436665346", "text": "RT @GreenEyeRaven1: If we have to remind you our bodies are ours ...we will...stop killing us\n#GBVmustfall #day84oflockdown #COVID19 #Prote\u2026", "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>", "truncated": false, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 504191291, "id_str": "504191291", "name": "GoldenMdee", "screen_name": "GoldenMdee", "location": "South Africa", "url": null, "description": "\u201cBe kind to the person you are becoming\u201d \u2665\ufe0f\n\n \nhttps://www.instagram.com/goldenmdee/", "translator_type": "none", "protected": false, "verified": false, "followers_count": 5827, "friends_count": 5430, "listed_count": 0, "favourites_count": 22118, "statuses_count": 10078, "created_at": "Sun Feb 26 08:16:57 +0000 2012", "utc_offset": null, "time_zone": null, "geo_enabled": true, "lang": null, "contributors_enabled": false, "is_translator": false, "profile_background_color": "C0DEED", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_tile": false, "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "profile_image_url": "http://pbs.twimg.com/profile_images/1225124177075736577/xU452O6s_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1225124177075736577/xU452O6s_normal.jpg", "profile_banner_url": "https://pbs.twimg.com/profile_banners/504191291/1586165128", "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": 
null, "notifications": null}, "geo": null, "coordinates": null, "place": null, "contributors": null, "retweeted_status": {"created_at": "Thu Jun 18 11:14:49 +0000 2020", "id": 1273574885122605061, "id_str": "1273574885122605061", "text": "If we have to remind you our bodies are ours ...we will...stop killing us\n#GBVmustfall #day84oflockdown #COVID19\u2026 https://t.co/UUk2uGJKui", "display_text_range": [0, 140], "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>", "truncated": true, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1249208580579954689, "id_str": "1249208580579954689", "name": "Green Eyed Raven", "screen_name": "GreenEyeRaven1", "location": null, "url": "http://onlyfans.com/greeneyedraven", "description": "Only Respond to DMs and private requests on ONLYFANS\ud83e\udd2d\nold account deleted at 30k\ud83d\udc94\nNude Model \u25cb\nContent Creator \u25cf", "translator_type": "none", "protected": false, "verified": false, "followers_count": 5688, "friends_count": 81, "listed_count": 11, "favourites_count": 189, "statuses_count": 249, "created_at": "Sun Apr 12 05:32:10 +0000 2020", "utc_offset": null, "time_zone": null, "geo_enabled": false, "lang": null, "contributors_enabled": false, "is_translator": false, "profile_background_color": "F5F8FA", "profile_background_image_url": "", "profile_background_image_url_https": "", "profile_background_tile": false, "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "profile_image_url": "http://pbs.twimg.com/profile_images/1273699570770419714/uM9rUKWw_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1273699570770419714/uM9rUKWw_normal.jpg", "default_profile": true, 
"default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null}, "geo": null, "coordinates": null, "place": null, "contributors": null, "is_quote_status": false, "extended_tweet": {"full_text": "If we have to remind you our bodies are ours ...we will...stop killing us\n#GBVmustfall #day84oflockdown #COVID19 #ProtectBlackWomen #BlackLivesMatter https://t.co/hRWdSYCmkN", "display_text_range": [0, 149], "entities": {"hashtags": [{"text": "GBVmustfall", "indices": [74, 86]}, {"text": "day84oflockdown", "indices": [87, 103]}, {"text": "COVID19", "indices": [104, 112]}, {"text": "ProtectBlackWomen", "indices": [113, 131]}, {"text": "BlackLivesMatter", "indices": [132, 149]}], "urls": [], "user_mentions": [], "symbols": [], "media": [{"id": 1273574859147292673, "id_str": "1273574859147292673", "indices": [150, 173], "media_url": "http://pbs.twimg.com/media/EaylljJXsAEWa_9.jpg", "media_url_https": "https://pbs.twimg.com/media/EaylljJXsAEWa_9.jpg", "url": "https://t.co/hRWdSYCmkN", "display_url": "pic.twitter.com/hRWdSYCmkN", "expanded_url": "https://twitter.com/GreenEyeRaven1/status/1273574885122605061/photo/1", "type": "photo", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "medium": {"w": 1200, "h": 871, "resize": "fit"}, "large": {"w": 1600, "h": 1161, "resize": "fit"}, "small": {"w": 680, "h": 493, "resize": "fit"}}}, {"id": 1273574873634398208, "id_str": "1273574873634398208", "indices": [150, 173], "media_url": "http://pbs.twimg.com/media/EaylmZHXYAA__2Z.jpg", "media_url_https": "https://pbs.twimg.com/media/EaylmZHXYAA__2Z.jpg", "url": "https://t.co/hRWdSYCmkN", "display_url": "pic.twitter.com/hRWdSYCmkN", "expanded_url": "https://twitter.com/GreenEyeRaven1/status/1273574885122605061/photo/1", "type": "photo", "sizes": {"large": {"w": 864, "h": 1296, "resize": "fit"}, "thumb": {"w": 150, "h": 150, "resize": "crop"}, "medium": {"w": 800, "h": 1200, "resize": "fit"}, "small": {"w": 453, "h": 680, "resize": "fit"}}}]}, 
"extended_entities": {"media": [{"id": 1273574859147292673, "id_str": "1273574859147292673", "indices": [150, 173], "media_url": "http://pbs.twimg.com/media/EaylljJXsAEWa_9.jpg", "media_url_https": "https://pbs.twimg.com/media/EaylljJXsAEWa_9.jpg", "url": "https://t.co/hRWdSYCmkN", "display_url": "pic.twitter.com/hRWdSYCmkN", "expanded_url": "https://twitter.com/GreenEyeRaven1/status/1273574885122605061/photo/1", "type": "photo", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "medium": {"w": 1200, "h": 871, "resize": "fit"}, "large": {"w": 1600, "h": 1161, "resize": "fit"}, "small": {"w": 680, "h": 493, "resize": "fit"}}}, {"id": 1273574873634398208, "id_str": "1273574873634398208", "indices": [150, 173], "media_url": "http://pbs.twimg.com/media/EaylmZHXYAA__2Z.jpg", "media_url_https": "https://pbs.twimg.com/media/EaylmZHXYAA__2Z.jpg", "url": "https://t.co/hRWdSYCmkN", "display_url": "pic.twitter.com/hRWdSYCmkN", "expanded_url": "https://twitter.com/GreenEyeRaven1/status/1273574885122605061/photo/1", "type": "photo", "sizes": {"large": {"w": 864, "h": 1296, "resize": "fit"}, "thumb": {"w": 150, "h": 150, "resize": "crop"}, "medium": {"w": 800, "h": 1200, "resize": "fit"}, "small": {"w": 453, "h": 680, "resize": "fit"}}}]}}, "quote_count": 538, "reply_count": 1290, "retweet_count": 2466, "favorite_count": 10241, "entities": {"hashtags": [{"text": "GBVmustfall", "indices": [74, 86]}, {"text": "day84oflockdown", "indices": [87, 103]}, {"text": "COVID19", "indices": [104, 112]}], "urls": [{"url": "https://t.co/UUk2uGJKui", "expanded_url": "https://twitter.com/i/web/status/1273574885122605061", "display_url": "twitter.com/i/web/status/1\u2026", "indices": [114, 137]}], "user_mentions": [], "symbols": []}, "favorited": false, "retweeted": false, "possibly_sensitive": true, "filter_level": "low", "lang": "en"}, "is_quote_status": false, "quote_count": 0, "reply_count": 0, "retweet_count": 0, "favorite_count": 0, "entities": {"hashtags": [{"text": 
"GBVmustfall", "indices": [94, 106]}, {"text": "day84oflockdown", "indices": [107, 123]}, {"text": "COVID19", "indices": [124, 132]}], "urls": [], "user_mentions": [{"screen_name": "GreenEyeRaven1", "name": "Green Eyed Raven", "id": 1249208580579954689, "id_str": "1249208580579954689", "indices": [3, 18]}], "symbols": []}, "favorited": false, "retweeted": false, "filter_level": "low", "lang": "en", "timestamp_ms": "1592514008517"}
I'm getting the following error:
Error: org.apache.spark.sql.AnalysisException: Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).json(file).filter("_corrupt_record".isNotNull).count() and spark.read.schema(schema).json(file).select("_corrupt_record").show(). Instead, you can cache or save the parsed results and then send the same query. For example, val df = spark.read.schema(schema).json(file).cache() and then df.filter("_corrupt_record".isNotNull).count().;
Looking at this error message, the commands it suggests (spark.read.schema(schema).json(file) and the others) appear to be Spark's Scala API rather than sparklyr commands.
Has anyone experienced this issue before and found a way to deal with it? Thanks!