Midjourney 2022 - 250k [CSV]
Data Science and Analytics
Related Searches
Trusted By



![Midjourney 2022 - 250k [CSV] Dataset on Opendatabay data marketplace](/_next/image?url=https%3A%2F%2Fstorage.googleapis.com%2Fopendatabay_public%2Fe2bafc6d-9436-4dbd-ab19-2f16c73f09e7%2F3c8f3ad0-1749193623418.jpg&w=1200&q=75)
"No reviews yet"
Free
About
This data was reformatted from "Midjourney User Prompts & Generated Images (250k)". Please give that dataset an upvote (+1), and this one as well.
TLDR
250k.csv (248k rows) = Good for text search to show images. Contains re-runs, which could have similar output images.
reduced.csv (130k rows) = No re-runs, but may contain duplicate text with different arguments.
raw.csv (251k rows) = messages containing commands and probably unwanted content. URLs are full length.
img_url is the trailing part of each image URL; the common URL prefix has been stripped.
text is the text portion of the /imagine command, excluding arguments and input URLs.
wc -l *
251390 midjourney_2022_250k_raw.csv
248069 midjourney_2022_250k.csv
130407 midjourney_2022_reduced.csv
The common URL prefix has been removed from the URLs to save memory. Re-attach it like so:
image: https://cdn.discordapp.com/attachments/ + img_url
image: https://media.discordapp.net/attachments/ + img_url
job page: https://www.midjourney.com/app/jobs/ + job_id
The raw version contains errors, chat, server messages, and other anomalies. It's not recommended unless you want to start over.
Mistake: "thumbnail" is just another URL.
Probably all the code you need.
import os,sys,re,glob,json
import pandas as pd
import numpy as np
import glob
from tqdm.notebook import tqdm
def extract_data(filename, ii=0):
    """Collect one row per attachment from a JSON message export.

    The file must contain a JSON object whose ``'messages'`` entry is a list
    of lists of message dicts. Only the first message of each inner list is
    read; any further entries in the same inner list are ignored.

    Args:
        filename: Path of the JSON file to parse.
        ii: Running row counter carried over from previous calls.

    Returns:
        A ``(data, ii)`` tuple. ``data`` is a list of
        ``[timestamp, content, url, proxy_url]`` rows, one per attachment
        (``content`` is ``None`` when the message has none; missing
        timestamp/URLs become ``''``). ``ii`` is the input counter plus the
        number of rows appended.

    Raises:
        KeyError: If the JSON object has no ``'messages'`` entry.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as fh:
        obj = json.load(fh)

    data = []
    for group in obj['messages']:
        # Slicing (rather than indexing) tolerates empty inner lists.
        for message in group[:1]:
            content = message.get('content', None)
            timestamp = message.get('timestamp', '')
            for attachment in message.get('attachments', []):
                thumb_url = attachment.get('url', '')
                img_url = attachment.get('proxy_url', '')
                data.append([timestamp, content, thumb_url, img_url])
                # NOTE(review): counter is bumped once per appended row; the
                # original indentation was lost, so confirm this nesting.
                ii += 1
    return data, ii
def _parse_cmd(message):
    """Return the text between the outermost ``**`` markers, or None."""
    if '*' not in message:
        return None
    # '.' does not match newlines, so flatten the message first.
    match = re.search(r'[*]{2,20}(.+)[*]{2,20}', message.replace('\n', ' '))
    if match is None:
        return None
    # Normalise em dashes to the '--' argument prefix.
    return match.group(1).replace('—', '--')


def _parse_job_id(url):
    """Return the first 36-character run of ``[a-f0-9-]`` in *url*, or None."""
    match = re.search(r'[a-f0-9-]{36,36}', url)
    return match.group(0) if match else None


def _parse_text(cmd):
    """Return *cmd* without ``<...>`` spans and without ``--`` arguments."""
    # NOTE(review): the match is greedy, so everything from the first '<' to
    # the last '>' is removed, including any text in between.
    cmd = re.sub(r'[<](.+)[>]', '', cmd)
    return cmd.split('--')[0].strip()


def expand_df(df):
    """Derive ``L``, ``cmd``, ``job_id`` and ``text`` columns and filter rows.

    Steps, in order: drop duplicate ``img_url`` rows; drop rows whose message
    is empty or missing; keep only messages containing a ``**...**`` span
    (stored as ``cmd``); strip the ``media.discordapp.net`` attachment prefix
    from ``img_url`` and keep only URLs containing ``'png'``; extract
    ``job_id`` from the thumbnail URL and drop duplicate job ids; derive
    ``text`` from ``cmd``.

    Args:
        df: DataFrame with ``_message``, ``img_url`` and ``_thumb_url``
            columns. A column named ``thumb_url`` is accepted as a fallback
            when ``_thumb_url`` is absent.

    Returns:
        A new, randomly shuffled DataFrame. The input frame is not modified.
    """
    if '_thumb_url' not in df.columns and 'thumb_url' in df.columns:
        thumb_col = 'thumb_url'
    else:
        thumb_col = '_thumb_url'

    # Work on copies so the caller's frame is untouched and later column
    # assignments never target a slice of another frame.
    df = df.drop_duplicates('img_url').copy()

    # Missing content (None/NaN) counts as an empty message instead of
    # crashing len().
    df['L'] = df['_message'].fillna('').apply(len)
    df = df[df['L'] > 0].copy()

    df['cmd'] = df['_message'].apply(_parse_cmd)
    df = df[df['cmd'].notnull()].copy()

    df['img_url'] = df['img_url'].str.replace(
        'https://media.discordapp.net/attachments/', '', regex=False)
    df = df[df['img_url'].str.contains('png', regex=False, na=False)].copy()

    df['job_id'] = df[thumb_col].apply(_parse_job_id)
    df = df.drop_duplicates('job_id').copy()

    df['text'] = df['cmd'].apply(_parse_text)
    return df.sample(frac=1.0)
# Build the three CSV files from every JSON export found under DATA_DIR.
# NOTE(review): DATA_DIR and BASE_DIR are not defined in this snippet; both are
# used as plain string prefixes, so presumably they end with a path separator.
files = glob.glob(f'{DATA_DIR}*.json')
ii = 0
data = []
for filename in tqdm(files):
    dd, ii = extract_data(filename, ii)
    data.extend(dd)

# Underscore-prefixed columns are intermediate inputs: expand_df reads
# `_message` and `_thumb_url`, so the names here must match.
columns = ['timestamp', '_message', '_thumb_url', 'img_url']
df = pd.DataFrame(data, columns=columns)
df.to_csv(BASE_DIR+'midjourney_2022_250k_raw.csv')

df = expand_df(df)
# Drop the intermediate `_`-prefixed columns before publishing. (Comparing the
# first character with '' is always true and would keep every column.)
df = df[[c for c in df.columns if len(c) > 0 and str(c)[0] != '_']]
df.to_csv(BASE_DIR+'midjourney_2022_250k.csv')

# Shuffle first so the row kept for each duplicated command is random.
df = df.sample(frac=1.0).drop_duplicates(['cmd'])
df = df.sort_values('L', ascending=True)
df.to_csv(BASE_DIR+'midjourney_2022_reduced.csv')
Original Data Source: Midjourney 2022 - 250k [CSV]