The Twitter data was crawled and is nested up to six levels deep. Many columns are repeated, both at the root level and within the nested levels. The schema of the data is shown below:
root
|-- contributors: string (nullable = true)
|-- coordinates: string (nullable = true)
|-- created_at: string (nullable = true)
|-- entities: struct (nullable = true)
| |-- hashtags: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- media: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- additional_media_info: struct (nullable = true)
| | | | |-- description: string (nullable = true)
| | | | |-- embeddable: boolean (nullable = true)
| | | | |-- monetizable: boolean (nullable = true)
| | | | |-- title: string (nullable = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- media_url: string (nullable = true)
| | | |-- media_url_https: string (nullable = true)
| | | |-- sizes: struct (nullable = true)
| | | | |-- large: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- medium: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- small: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- thumb: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | |-- source_status_id: long (nullable = true)
| | | |-- source_status_id_str: string (nullable = true)
| | | |-- source_user_id: long (nullable = true)
| | | |-- source_user_id_str: string (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- url: string (nullable = true)
| |-- symbols: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- urls: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- url: string (nullable = true)
| |-- user_mentions: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- id: long (nullable = true)
I cannot understand the meaning of a few of the columns. If possible, please explain them with reference to the root level, because the columns inside the deeper levels are repeated.
Generally, we understand the meaning of data columns, but when the same column is repeated in different contexts within the same dataset (under different parent levels), it becomes cumbersome to work out what each occurrence means. I hope to get a proper answer.
The first record is shown below as an example:
{"created_at": "Mon Aug 27 19:45:27 +0000 2018", "id": 1034165013202829315, "id_str": "1034165013202829315", "text": "RT @5SOS: VALENTINE OFFICIAL VIDEO // 14.09.18 http://tq.com/8HTz4X8l3n", "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>", "truncated": false, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 327185598, "id_str": "327185598", "name": "duds \ud83d\udc8c", "screen_name": "brndnflowers", "location": "barbara \ud83d\udcab", "url": "http://www.tvtime.com/prettyodds", "description": "#1 jake peralta stan", "translator_type": "regular", "protected": false, "verified": false, "followers_count": 1895, "friends_count": 914, "listed_count": 28, "favourites_count": 3943, "statuses_count": 162975, "created_at": "Fri Jul 01 04:05:18 +0000 2011", "utc_offset": null, "time_zone": null, "geo_enabled": true, "lang": "pt", "contributors_enabled": false, "is_translator": false, "profile_background_color": "FFFFFF", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme17/bg.gif", "profile_background_image_url_http": "http://abs.twimg.com/images/themes/theme17/bg.gif", "profile_background_tile": true, "profile_link_color": "365956", "profile_sidebar_border_color": "FFFFFF", "profile_sidebar_fill_color": "FFFFFF", "profile_text_color": "000000", "profile_use_background_image": false, "profile_image_url": "http://pbs.twimg.com/profile_images/1032968341021974528/-PZPfru5_normal.jpg", "profile_image_url_http": "http://pbs.twimg.com/profile_images/1032968341021974528/-PZPfru5_normal.jpg", "profile_banner_url": "http://pbs.twimg.com/profile_banners/327185598/1533595802", "default_profile": false, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null}, "geo": null, "coordinates": null, "place": null, "contributors": null, 
"retweeted_status": {"created_at": "Mon Aug 27 19:44:52 +0000 2018", "id": 1034164865575931906, "id_str": "1034164865575931906", "text": "VALENTINE OFFICIAL VIDEO // 14.09.18 http://tq.com/8HTz4X8l3n", "display_text_range": [0, 36], "source": "<a href=\"http://studio.twitter.com\" rel=\"nofollow\">Media Studio</a>", "truncated": false, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 264107729, "id_str": "264107729", "name": "5 Seconds Of Summer", "screen_name": "5SOS", "location": "Sydney, Australia", "url": "http://5sos.com", "description": "GET #YOUNGBLOOD http://5sosuk.lnk.to/YoungbloodDeluxeTW // @ashton5sos @calum5sos @michael5sos @luke5sos @HiOrHeyRecords snap: wearefivesos", "translator_type": "regular", "protected": false, "verified": true, "followers_count": 12516981, "friends_count": 30957, "listed_count": 26888, "favourites_count": 5002, "statuses_count": 27196, "created_at": "Fri Mar 11 10:18:46 +0000 2011", "utc_offset": null, "time_zone": null, "geo_enabled": true, "lang": "en", "contributors_enabled": false, "is_translator": false, "profile_background_color": "FFFFFF", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_http": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_tile": false, "profile_link_color": "C21B1B", "profile_sidebar_border_color": "FFFFFF", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "profile_image_url": "http://pbs.twimg.com/profile_images/984153799173591043/TjsDcMti_normal.jpg", "profile_image_url_http": "http://pbs.twimg.com/profile_images/984153799173591043/TjsDcMti_normal.jpg", "profile_banner_url": "http://pbs.twimg.com/profile_banners/264107729/1523475560", "default_profile": false, "default_profile_image": false, "following": null, 
"follow_request_sent": null, "notifications": null}, "geo": null, "coordinates": null, "place": null, "contributors": null, "is_quote_status": false, "quote_count": 20, "reply_count": 120, "retweet_count": 718, "favorite_count": 1191, "entities": {"hashtags": [], "urls": [], "user_mentions": [], "symbols": [], "media": [{"id": 1034164221951520768, "id_str": "1034164221951520768", "indices": [37, 60], "additional_media_info": {"title": "", "description": "", "embeddable": true, "monetizable": false}, "media_url": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "media_url_http": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "url": "http://tq.com/8HTz4X8l3n", "display_url": "pic.twitter.com/8HTz4X8l3n", "expanded_url": "http://twitter.com/5SOS/status/1034164865575931906/video/1", "type": "photo", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "small": {"w": 680, "h": 680, "resize": "fit"}, "medium": {"w": 720, "h": 720, "resize": "fit"}, "large": {"w": 720, "h": 720, "resize": "fit"}}}]}, "extended_entities": {"media": [{"id": 1034164221951520768, "id_str": "1034164221951520768", "indices": [37, 60], "additional_media_info": {"title": "", "description": "", "embeddable": true, "monetizable": false}, "media_url": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "media_url_http": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "url": "http://tq.com/8HTz4X8l3n", "display_url": "pic.twitter.com/8HTz4X8l3n", "expanded_url": "http://twitter.com/5SOS/status/1034164865575931906/video/1", "type": "video", "video_info": {"aspect_ratio": [1, 1], "duration_millis": 15015, "variants": [{"bitrate": 288000, "content_type": "video/mp4", "url": "http://video.twimg.com/amplify_video/1034164221951520768/vid/240x240/oARVmpjwhwrXs84p.mp4?tag=8"}, {"content_type": "application/x-mpegURL", "url": 
"http://video.twimg.com/amplify_video/1034164221951520768/pl/a8YWcfqwnYX-HhAg.m3u8?tag=8"}, {"bitrate": 1280000, "content_type": "video/mp4", "url": "http://video.twimg.com/amplify_video/1034164221951520768/vid/720x720/45y13MNcWpRuiLHa.mp4?tag=8"}, {"bitrate": 832000, "content_type": "video/mp4", "url": "http://video.twimg.com/amplify_video/1034164221951520768/vid/480x480/kGw6ErvAq1GWFIfx.mp4?tag=8"}]}, "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "small": {"w": 680, "h": 680, "resize": "fit"}, "medium": {"w": 720, "h": 720, "resize": "fit"}, "large": {"w": 720, "h": 720, "resize": "fit"}}}]}, "favorited": false, "retweeted": false, "possibly_sensitive": false, "filter_level": "low", "lang": "en"}, "is_quote_status": false, "quote_count": 0, "reply_count": 0, "retweet_count": 0, "favorite_count": 0, "entities": {"hashtags": [], "urls": [], "user_mentions": [{"screen_name": "5SOS", "name": "5 Seconds Of Summer", "id": 264107729, "id_str": "264107729", "indices": [3, 8]}], "symbols": [], "media": [{"id": 1034164221951520768, "id_str": "1034164221951520768", "indices": [47, 70], "additional_media_info": {"title": "", "description": "", "embeddable": true, "monetizable": false}, "media_url": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "media_url_http": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "url": "http://tq.com/8HTz4X8l3n", "display_url": "pic.twitter.com/8HTz4X8l3n", "expanded_url": "http://twitter.com/5SOS/status/1034164865575931906/video/1", "type": "photo", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "small": {"w": 680, "h": 680, "resize": "fit"}, "medium": {"w": 720, "h": 720, "resize": "fit"}, "large": {"w": 720, "h": 720, "resize": "fit"}}, "source_status_id": 1034164865575931906, "source_status_id_str": "1034164865575931906", "source_user_id": 264107729, "source_user_id_str": "264107729"}]}, "extended_entities": {"media": [{"id": 
1034164221951520768, "id_str": "1034164221951520768", "indices": [47, 70], "additional_media_info": {"title": "", "description": "", "embeddable": true, "monetizable": false}, "media_url": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "media_url_http": "http://pbs.twimg.com/amplify_video_thumb/1034164221951520768/img/wZQcH1z3eJ2lLdhU.jpg", "url": "http://tq.com/8HTz4X8l3n", "display_url": "pic.twitter.com/8HTz4X8l3n", "expanded_url": "http://twitter.com/5SOS/status/1034164865575931906/video/1", "type": "video", "video_info": {"aspect_ratio": [1, 1], "duration_millis": 15015, "variants": [{"bitrate": 288000, "content_type": "video/mp4", "url": "http://video.twimg.com/amplify_video/1034164221951520768/vid/240x240/oARVmpjwhwrXs84p.mp4?tag=8"}, {"content_type": "application/x-mpegURL", "url": "http://video.twimg.com/amplify_video/1034164221951520768/pl/a8YWcfqwnYX-HhAg.m3u8?tag=8"}, {"bitrate": 1280000, "content_type": "video/mp4", "url": "http://video.twimg.com/amplify_video/1034164221951520768/vid/720x720/45y13MNcWpRuiLHa.mp4?tag=8"}, {"bitrate": 832000, "content_type": "video/mp4", "url": "http://video.twimg.com/amplify_video/1034164221951520768/vid/480x480/kGw6ErvAq1GWFIfx.mp4?tag=8"}]}, "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "small": {"w": 680, "h": 680, "resize": "fit"}, "medium": {"w": 720, "h": 720, "resize": "fit"}, "large": {"w": 720, "h": 720, "resize": "fit"}}, "source_status_id": 1034164865575931906, "source_status_id_str": "1034164865575931906", "source_user_id": 264107729, "source_user_id_str": "264107729"}]}, "favorited": false, "retweeted": false, "possibly_sensitive": false, "filter_level": "low", "lang": "en", "timestamp_ms": "1535399127661"}
The schema is reproducible in R. If anyone wants, I can provide the R code to see the details.